From b53eb7dcda6cb2c8e5a3f49bee15189fc2232401 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 26 Dec 2018 19:43:38 +0800
Subject: [PATCH 01/73] add init once for assign layer

---
 python/paddle/fluid/layers/nn.py              |  8 ++--
 python/paddle/fluid/layers/tensor.py          | 39 ++++++++++++++-----
 .../fluid/tests/unittests/test_layers.py      | 12 ++++++
 3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index cc1fdbd2856..00523c07987 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5010,10 +5010,12 @@ def nce(input,
             alias_probs_[little[0]] = 1.0
             alias_[little[0]] = -1
 
-        probs = assign(input=np.array(custom_dist).astype('float32'))
-        custom_alias = assign(input=np.array(alias_).astype('int32'))
+        probs = assign(
+            input=np.array(custom_dist).astype('float32'), init_once=True)
+        custom_alias = assign(
+            input=np.array(alias_).astype('int32'), init_once=True)
         custom_alias_probs = assign(
-            input=np.array(alias_probs_).astype('float32'))
+            input=np.array(alias_probs_).astype('float32'), init_once=True)
 
         inputs['CustomDistProbs'] = probs
         inputs['CustomDistAlias'] = custom_alias
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 49a486cf0c3..d66d92b1df7 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -285,7 +285,7 @@ def sums(input, out=None):
     return out
 
 
-def assign(input, output=None):
+def assign(input, output=None, init_once=False):
     """
     **Assign**
 
@@ -294,6 +294,7 @@ def assign(input, output=None):
     Args:
         input(Variable|numpy.ndarray): The source variable
         output(Variable|None): The destination variable
+        init_once(bool|false): assign value into global var only in startup program.
 
     Returns:
         Variable: The destination variable that was supplied as the *output*.
@@ -307,10 +308,18 @@ def assign(input, output=None):
     """
     helper = LayerHelper('assign', **locals())
     if output is None:
-        output = helper.create_variable_for_type_inference(dtype=input.dtype)
+        if init_once:
+            output = helper.create_parameter(
+                attr=ParamAttr(), shape=input.shape, dtype=input.dtype)
+        else:
+            output = helper.create_variable_for_type_inference(
+                dtype=input.dtype)
     if isinstance(input, Variable):
+        if init_once:
+            raise ValueError("init once only support numpy assign!")
         helper.append_op(
             type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
+
     elif isinstance(input, numpy.ndarray):
         dtype = convert_np_dtype_to_dtype_(input.dtype)
         if dtype == VarDesc.VarType.FP32:
@@ -325,14 +334,24 @@ def assign(input, output=None):
             raise ValueError("The size of input is too big. Please consider "
                              "saving it to file and 'load_op' to load it")
 
-        helper.append_op(
-            type='assign_value',
-            outputs={'Out': [output]},
-            attrs={
-                'dtype': dtype,
-                'shape': list(input.shape),
-                value_name: values
-            })
+        if init_once:
+            helper.startup_program.global_block().append_op(
+                type='assign_value',
+                outputs={'Out': [output]},
+                attrs={
+                    'dtype': dtype,
+                    'shape': list(input.shape),
+                    value_name: values
+                })
+        else:
+            helper.append_op(
+                type='assign_value',
+                outputs={'Out': [output]},
+                attrs={
+                    'dtype': dtype,
+                    'shape': list(input.shape),
+                    value_name: values
+                })
     else:
         raise ValueError("Wrong type for assign input: %s" % type(input))
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index e180822c2b4..92065abb9bf 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1015,6 +1015,18 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_assign(self):
+        import numpy as np
+        startup = Program()
+        main = Program()
+        with program_guard(main, startup):
+            probs = layers.assign(
+                input=np.random.random([1, 2]).astype('float32'),
+                init_once=True)
+
+        print(str(main))
+        print(str(startup))
+
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 031995cf589b936891aecedb5e13cd93f42ec4eb Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 26 Dec 2018 22:10:03 +0800
Subject: [PATCH 02/73] fix assign import in nn.py and zero-initialize init_once parameter

---
 python/paddle/fluid/layers/nn.py     | 2 +-
 python/paddle/fluid/layers/tensor.py | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 00523c07987..ee165d092c1 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -26,7 +26,7 @@ from ..initializer import Normal, Constant
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
-from .tensor import concat
+from .tensor import concat, assign
 from . import utils
 from .. import unique_name
 from functools import reduce
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index d66d92b1df7..5d5657eae52 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -310,7 +310,10 @@ def assign(input, output=None, init_once=False):
     if output is None:
         if init_once:
             output = helper.create_parameter(
-                attr=ParamAttr(), shape=input.shape, dtype=input.dtype)
+                attr=ParamAttr(),
+                shape=input.shape,
+                dtype=input.dtype,
+                default_initializer=Constant(0.0))
         else:
             output = helper.create_variable_for_type_inference(
                 dtype=input.dtype)
-- 
GitLab


From 0384f3309a68c40c2c7e88c317dc536e3279e8e0 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 26 Dec 2018 22:36:02 +0800
Subject: [PATCH 03/73] enable unit test for test_nce test=develop

---
 python/paddle/fluid/layers/tensor.py               | 1 +
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 5d5657eae52..3e36fb7632c 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -314,6 +314,7 @@ def assign(input, output=None, init_once=False):
                 shape=input.shape,
                 dtype=input.dtype,
                 default_initializer=Constant(0.0))
+            output.stop_gradient = True
         else:
             output = helper.create_variable_for_type_inference(
                 dtype=input.dtype)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 6d6fe245d8a..5d0fd7b1b10 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -32,7 +32,6 @@ endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
-- 
GitLab


From 908684a535a162f54f3e01b449779dda1853de85 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 9 Jan 2019 14:27:49 +0800
Subject: [PATCH 04/73] raise the maximum input size of assign to 5 * 1024 * 1024 elements

---
 python/paddle/fluid/layers/tensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 3e36fb7632c..4f73194d82e 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -334,7 +334,7 @@ def assign(input, output=None, init_once=False):
             values = [int(v) for v in input.flat]
         else:
             raise ValueError("Unsupported dtype %s", input.dtype)
-        if input.size > 1024 * 1024:
+        if input.size > 1024 * 1024 * 5:
             raise ValueError("The size of input is too big. Please consider "
                              "saving it to file and 'load_op' to load it")
 
-- 
GitLab


From 03fe31097b324f4a6500c7e1dded164fff699b91 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 15 Jan 2019 15:35:25 +0800
Subject: [PATCH 05/73] add static GAN

---
 python/paddle/fluid/imperative/nn.py          |  18 ++-
 .../tests/unittests/test_imperative_base.py   |  11 +-
 .../tests/unittests/test_imperative_gan.py    | 134 ++++++++++++++++++
 3 files changed, 155 insertions(+), 8 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_gan.py

diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 8754e5d4d0c..eeca3370841 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -209,14 +209,22 @@ class FC(layers.Layer):
     def __init__(self,
                  size,
                  param_attr=None,
+                 bias_attr=None,
                  num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32):
+                 dtype=core.VarDesc.VarType.FP32,
+                 act=None,
+                 name=None):
         super(FC, self).__init__()
         self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
         from ..layer_helper import LayerHelper
-        self._helper = LayerHelper('FC', param_attr=param_attr)
+        self._helper = LayerHelper(
+            'FC',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            act=act,
+            name=name)
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -247,4 +255,8 @@ class FC(layers.Layer):
             inputs={"X": [tmp]},
             outputs={"Out": out},
             attrs={"use_mkldnn": False})
-        return out
+        # add bias
+        pre_activation = self._helper.append_bias_op(
+            out, dim_start=self._num_flatten_dims)
+        # add activation
+        return self._helper.append_activation(pre_activation)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_base.py b/python/paddle/fluid/tests/unittests/test_imperative_base.py
index 478cc13fb5b..1dd5348a885 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_base.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_base.py
@@ -21,10 +21,11 @@ from paddle.fluid import core
 
 
 @contextlib.contextmanager
-def new_program_scope():
-    prog = fluid.Program()
-    startup_prog = fluid.Program()
-    scope = fluid.core.Scope()
+def new_program_scope(main=None, startup=None, scope=None):
+    prog = main if main else fluid.Program()
+    startup_prog = startup if startup else fluid.Program()
+    scope = scope if scope else fluid.core.Scope()
     with fluid.scope_guard(scope):
         with fluid.program_guard(prog, startup_prog):
-            yield
+            with fluid.unique_name.guard():
+                yield
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
new file mode 100644
index 00000000000..9748e0a3776
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import sys
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from test_imperative_base import new_program_scope
+
+
+class Discriminator(fluid.imperative.Layer):
+    def __init__(self):
+        super(Discriminator, self).__init__()
+        self._fc1 = FC(size=32, act='elu', name="d_fc1")
+        self._fc2 = FC(size=1, name="d_fc2")
+
+    def forward(self, inputs):
+        x = self._fc1(inputs)
+        return self._fc2(x)
+
+
+class Generator(fluid.imperative.Layer):
+    def __init__(self):
+        super(Generator, self).__init__()
+        self._fc1 = FC(size=64, act='elu', name="g_fc1")
+        self._fc2 = FC(size=64, act='elu', name="g_fc2")
+        self._fc3 = FC(size=1, name="g_fc3")
+
+    def forward(self, inputs):
+        x = self._fc1(inputs)
+        x = self._fc2(x)
+        return self._fc3(x)
+
+
+class TestImperativeMnist(unittest.TestCase):
+    def test_mnist_cpu_float32(self):
+        seed = 90
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        discriminate_p = fluid.Program()
+        scope = fluid.core.Scope()
+        exe = fluid.Executor(fluid.CPUPlace())
+        with new_program_scope(
+                main=discriminate_p, startup=startup, scope=scope):
+            fluid.default_main_program().random_seed = seed
+
+            discriminator = Discriminator()
+            generator = Generator()
+
+            img = fluid.layers.data(
+                name="img", shape=[2, 1], append_batch_size=False)
+            noise = fluid.layers.data(
+                name="noise", shape=[2, 2], append_batch_size=False)
+
+            label = fluid.layers.data(
+                name='label',
+                shape=[2, 1],
+                dtype='float32',
+                append_batch_size=False)
+
+            d_real = discriminator(img)
+            d_loss_real = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_real, label=label))
+
+            d_fake = discriminator(generator(noise))
+            d_loss_fake = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_fake, label=label))
+
+            d_loss = d_loss_real + d_loss_fake
+
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            sgd.minimize(d_loss)
+
+        generate_p = fluid.Program()
+        with new_program_scope(main=generate_p, startup=startup, scope=scope):
+            fluid.default_main_program().random_seed = seed
+
+            discriminator = Discriminator()
+            generator = Generator()
+
+            noise = fluid.layers.data(
+                name="noise", shape=[2, 2], append_batch_size=False)
+            label = fluid.layers.data(
+                name='label',
+                shape=[2, 1],
+                dtype='float32',
+                append_batch_size=False)
+
+            d_fake = discriminator(generator(noise))
+            g_loss = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_fake, label=label))
+
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            sgd.minimize(g_loss)
+
+        img = np.ones([2, 1], np.float32)
+        label = np.ones([2, 1], np.float32)
+        noise = np.ones([2, 2], np.float32)
+        exe.run(startup)
+        d_loss_val = exe.run(discriminate_p,
+                             feed={'img': img,
+                                   'noise': noise,
+                                   'label': label},
+                             fetch_list=[d_loss])[0]
+        g_loss_val = exe.run(generate_p,
+                             feed={'noise': noise,
+                                   'label': label},
+                             fetch_list=[g_loss])[0]
+        sys.stderr.write('d_loss %s, g_loss: %s\n' % (d_loss_val, g_loss_val))
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab


From a61e7d0f48607f770ce8521d2c60a72723a24d85 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Tue, 15 Jan 2019 21:10:48 +0800
Subject: [PATCH 06/73] dy gan mostly working

test=develop
---
 python/paddle/fluid/imperative/layers.py      |   9 +-
 python/paddle/fluid/imperative/nn.py          |  31 ++++--
 .../tests/unittests/test_imperative_gan.py    | 105 ++++++++++++------
 3 files changed, 101 insertions(+), 44 deletions(-)

diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index f0fec03dba3..ed67dda6375 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -27,18 +27,21 @@ class Layer(core.Layer):
     """Layers composed of operators."""
 
     def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
-        self._once_built = False
+        self._built = False
         self._dtype = dtype
 
+    def parameters(self):
+        return []
+
     def _build_once(self, inputs):
         pass
 
     def __call__(self, *inputs):
-        if not self._once_built:
+        if not self._built:
             self._build_once(*inputs)
-            self._once_built = True
 
         outputs = self.forward(*inputs)
+        self._built = True
         return outputs
 
     def forward(self, *inputs):
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index eeca3370841..337a463041b 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -220,11 +220,14 @@ class FC(layers.Layer):
         self._dtype = dtype
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'FC',
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            act=act,
-            name=name)
+            'FC', param_attr=param_attr, act=act, name=name)
+        self._bias_attr = bias_attr
+
+    def parameters(self):
+        if self._bias_attr:
+            return [self._w, self._b]
+        else:
+            return [self._w]
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -255,8 +258,20 @@ class FC(layers.Layer):
             inputs={"X": [tmp]},
             outputs={"Out": out},
             attrs={"use_mkldnn": False})
+        if not self._bias_attr:
+            return out
+
         # add bias
-        pre_activation = self._helper.append_bias_op(
-            out, dim_start=self._num_flatten_dims)
+        size = list(out.shape[1:])
+        if not self._built:
+            self._b = self._layer.create_parameter(
+                attr=self._bias_attr, shape=size, dtype=out.dtype, is_bias=True)
+        bias_out = self.create_variable_for_type_inference(dtype=out.dtype)
+        self.append_op(
+            type='elementwise_add',
+            inputs={'X': [out],
+                    'Y': [self._b]},
+            outputs={'Out': [bias_out]},
+            attrs={'axis': 1})
         # add activation
-        return self._helper.append_activation(pre_activation)
+        return self._helper.append_activation(bias_out)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 9748e0a3776..af2a2f45aa8 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -23,6 +23,7 @@ import paddle.fluid as fluid
 from paddle.fluid.optimizer import SGDOptimizer
 from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
 
 
 class Discriminator(fluid.imperative.Layer):
@@ -31,6 +32,9 @@ class Discriminator(fluid.imperative.Layer):
         self._fc1 = FC(size=32, act='elu', name="d_fc1")
         self._fc2 = FC(size=1, name="d_fc2")
 
+    def parameters(self):
+        return self._fc1.parameters() + self._fc2.parameters()
+
     def forward(self, inputs):
         x = self._fc1(inputs)
         return self._fc2(x)
@@ -43,6 +47,10 @@ class Generator(fluid.imperative.Layer):
         self._fc2 = FC(size=64, act='elu', name="g_fc2")
         self._fc3 = FC(size=1, name="g_fc3")
 
+    def parameters(self):
+        return self._fc1.parameters() + self._fc2.parameters(
+        ) + self._fc3.parameters()
+
     def forward(self, inputs):
         x = self._fc1(inputs)
         x = self._fc2(x)
@@ -56,12 +64,15 @@ class TestImperativeMnist(unittest.TestCase):
         startup = fluid.Program()
         startup.random_seed = seed
         discriminate_p = fluid.Program()
+        generate_p = fluid.Program()
+        discriminate_p.random_seed = seed
+        generate_p.random_seed = seed
+
         scope = fluid.core.Scope()
         exe = fluid.Executor(fluid.CPUPlace())
+        sys.stderr.write('1111\n')
         with new_program_scope(
                 main=discriminate_p, startup=startup, scope=scope):
-            fluid.default_main_program().random_seed = seed
-
             discriminator = Discriminator()
             generator = Generator()
 
@@ -70,64 +81,92 @@ class TestImperativeMnist(unittest.TestCase):
             noise = fluid.layers.data(
                 name="noise", shape=[2, 2], append_batch_size=False)
 
-            label = fluid.layers.data(
-                name='label',
-                shape=[2, 1],
-                dtype='float32',
-                append_batch_size=False)
-
             d_real = discriminator(img)
             d_loss_real = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_real, label=label))
+                    x=d_real,
+                    label=fluid.layers.fill_constant(
+                        shape=[2, 1], dtype='float32', value=1.0)))
 
             d_fake = discriminator(generator(noise))
             d_loss_fake = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake, label=label))
+                    x=d_fake,
+                    label=fluid.layers.fill_constant(
+                        shape=[2, 1], dtype='float32', value=0.0)))
 
             d_loss = d_loss_real + d_loss_fake
 
             sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(d_loss)
 
-        generate_p = fluid.Program()
         with new_program_scope(main=generate_p, startup=startup, scope=scope):
-            fluid.default_main_program().random_seed = seed
-
             discriminator = Discriminator()
             generator = Generator()
 
             noise = fluid.layers.data(
                 name="noise", shape=[2, 2], append_batch_size=False)
-            label = fluid.layers.data(
-                name='label',
-                shape=[2, 1],
-                dtype='float32',
-                append_batch_size=False)
 
             d_fake = discriminator(generator(noise))
             g_loss = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake, label=label))
+                    x=d_fake,
+                    label=fluid.layers.fill_constant(
+                        shape=[2, 1], dtype='float32', value=1.0)))
 
             sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(g_loss)
 
-        img = np.ones([2, 1], np.float32)
-        label = np.ones([2, 1], np.float32)
-        noise = np.ones([2, 2], np.float32)
-        exe.run(startup)
-        d_loss_val = exe.run(discriminate_p,
-                             feed={'img': img,
-                                   'noise': noise,
-                                   'label': label},
-                             fetch_list=[d_loss])[0]
-        g_loss_val = exe.run(generate_p,
-                             feed={'noise': noise,
-                                   'label': label},
-                             fetch_list=[g_loss])[0]
-        sys.stderr.write('d_loss %s, g_loss: %s\n' % (d_loss_val, g_loss_val))
+        with fluid.scope_guard(scope):
+            img = np.ones([2, 1], np.float32)
+            noise = np.ones([2, 2], np.float32)
+            exe.run(startup)
+            d_loss_val = exe.run(discriminate_p,
+                                 feed={'img': img,
+                                       'noise': noise},
+                                 fetch_list=[d_loss])[0]
+            g_loss_val = exe.run(generate_p,
+                                 feed={'noise': noise},
+                                 fetch_list=[g_loss])[0]
+            sys.stderr.write('d_loss %s, g_loss: %s\n' %
+                             (d_loss_val, g_loss_val))
+
+            static_params = dict()
+            for param in discriminate_p.global_block().all_parameters():
+                sys.stderr.write('%s\n' % param.name)
+                static_params[param.name] = np.array(
+                    scope.find_var(param.name).get_tensor())
+
+        dy_params = dict()
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            discriminator = Discriminator()
+            generator = Generator()
+            sgd = SGDOptimizer(learning_rate=1e-3)
+
+            d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
+            d_loss_real = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_real, label=to_variable(np.ones([2, 1], np.float32))))
+
+            d_fake = discriminator(
+                generator(to_variable(np.ones([2, 2], np.float32))))
+            d_loss_fake = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))
+
+            d_loss = d_loss_real + d_loss_fake
+            sys.stderr.write('dy_d_loss: %s\n' % d_loss._numpy())
+            d_loss._backward()
+            sgd.minimize(d_loss)
+            for p in discriminator.parameters():
+                dy_params[p.name] = p._numpy()
+
+        for k, v in six.iteritems(dy_params):
+            sys.stderr.write('dy_param_loss: %s: %s\n' % (k, np.sum(v)))
+            sys.stderr.write('static_param_loss: %s: %s\n' % (k, np.sum(v)))
 
 
 if __name__ == '__main__':
-- 
GitLab


From 9a4314f025744931962a6f4d68aae11cccf3ab12 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 09:54:32 +0800
Subject: [PATCH 07/73] imperative gan

test=develop
---
 paddle/fluid/imperative/layer.h               |  9 +++-
 paddle/fluid/pybind/pybind.cc                 |  1 +
 python/paddle/fluid/framework.py              |  3 ++
 .../tests/unittests/test_imperative_gan.py    | 42 +++++++++++++------
 4 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 34aa701c5b9..2289da09070 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -101,7 +101,6 @@ class VarBase {
   // Owns `var` and `grad`
   VarBase(framework::Variable* var, VarBase* grad)
       : pre_op_(nullptr),
-        pre_op_out_name_(),
         pre_op_out_idx_(-1),
         var_desc_(nullptr),
         var_(var),
@@ -110,7 +109,6 @@ class VarBase {
 
   explicit VarBase(bool stop_gradient)
       : pre_op_(nullptr),
-        pre_op_out_name_(),
         pre_op_out_idx_(-1),
         var_desc_(nullptr),
         var_(new framework::Variable()),
@@ -127,6 +125,13 @@ class VarBase {
     }
   }
 
+  void Clear() {
+    delete grads_;
+    grads_ = new VarBase(true);
+    pre_op_ = nullptr;
+    pre_op_out_name_ = "";
+  }
+
   void RunBackward();
 
   framework::LoDTensor& GradValue();
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f3f4854a9ef..efe70a075d1 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -133,6 +133,7 @@ PYBIND11_MODULE(core, m) {
            [](imperative::VarBase &self) { self.RunBackward(); })
       .def("_grad_name", &imperative::VarBase::GradName)
       .def("_grad_value", &imperative::VarBase::GradValue)
+      .def("_clear", &imperative::VarBase::Clear)
       .def("_grad_ivar",
            [](const imperative::VarBase &self) { return self.grads_; },
            py::return_value_policy::reference)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 8d061f41f09..e737b9bc61a 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -388,6 +388,9 @@ class Variable(object):
     def _gradient(self):
         return np.array(self._ivar._grad_value())
 
+    def _clear(self):
+        self._ivar._clear()
+
     def __str__(self):
         return self.to_string(True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index af2a2f45aa8..c38906ce6ad 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -69,8 +69,6 @@ class TestImperativeMnist(unittest.TestCase):
         generate_p.random_seed = seed
 
         scope = fluid.core.Scope()
-        exe = fluid.Executor(fluid.CPUPlace())
-        sys.stderr.write('1111\n')
         with new_program_scope(
                 main=discriminate_p, startup=startup, scope=scope):
             discriminator = Discriminator()
@@ -117,6 +115,8 @@ class TestImperativeMnist(unittest.TestCase):
             sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(g_loss)
 
+        exe = fluid.Executor(fluid.CPUPlace())
+        static_params = dict()
         with fluid.scope_guard(scope):
             img = np.ones([2, 1], np.float32)
             noise = np.ones([2, 2], np.float32)
@@ -128,14 +128,14 @@ class TestImperativeMnist(unittest.TestCase):
             g_loss_val = exe.run(generate_p,
                                  feed={'noise': noise},
                                  fetch_list=[g_loss])[0]
-            sys.stderr.write('d_loss %s, g_loss: %s\n' %
-                             (d_loss_val, g_loss_val))
-
-            static_params = dict()
-            for param in discriminate_p.global_block().all_parameters():
-                sys.stderr.write('%s\n' % param.name)
+            for param in generate_p.global_block().all_parameters():
                 static_params[param.name] = np.array(
                     scope.find_var(param.name).get_tensor())
+                sys.stderr.write(
+                    'static_param_loss: %s: %s\n' %
+                    (param.name, np.sum(static_params[param.name])))
+            sys.stderr.write('d_loss %s, g_loss: %s\n' %
+                             (d_loss_val, g_loss_val))
 
         dy_params = dict()
         with fluid.imperative.guard():
@@ -158,15 +158,31 @@ class TestImperativeMnist(unittest.TestCase):
                     x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))
 
             d_loss = d_loss_real + d_loss_fake
-            sys.stderr.write('dy_d_loss: %s\n' % d_loss._numpy())
             d_loss._backward()
             sgd.minimize(d_loss)
             for p in discriminator.parameters():
-                dy_params[p.name] = p._numpy()
+                p._clear()
+            for p in generator.parameters():
+                p._clear()
 
-        for k, v in six.iteritems(dy_params):
-            sys.stderr.write('dy_param_loss: %s: %s\n' % (k, np.sum(v)))
-            sys.stderr.write('static_param_loss: %s: %s\n' % (k, np.sum(v)))
+            d_fake = discriminator(
+                generator(to_variable(np.ones([2, 2], np.float32))))
+            g_loss = fluid.layers.reduce_mean(
+                fluid.layers.sigmoid_cross_entropy_with_logits(
+                    x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
+            g_loss._backward()
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            sgd.minimize(g_loss)
+            for p in discriminator.parameters():
+                dy_params[p.name] = p._numpy()
+                sys.stderr.write('dy_param_loss: %s: %s\n' %
+                                 (p.name, np.sum(dy_params[p.name])))
+            for p in generator.parameters():
+                dy_params[p.name] = p._numpy()
+                sys.stderr.write('dy_param_loss: %s: %s\n' %
+                                 (p.name, np.sum(dy_params[p.name])))
+            sys.stderr.write('dy_d_loss: %s, dy_g_loss: %s\n' %
+                             (d_loss._numpy(), g_loss._numpy()))
 
 
 if __name__ == '__main__':
-- 
GitLab


From bfa2621fc3d63d8b6e0dca79c740e6ccc406a24c Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 10:13:04 +0800
Subject: [PATCH 08/73] fix bias handling in imperative FC layer

test=develop
---
 python/paddle/fluid/imperative/nn.py          | 14 ++++------
 python/paddle/fluid/layer_helper.py           |  3 +-
 .../tests/unittests/test_imperative_gan.py    | 28 +++++++++++--------
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 337a463041b..95b59487661 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -221,13 +221,10 @@ class FC(layers.Layer):
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
             'FC', param_attr=param_attr, act=act, name=name)
-        self._bias_attr = bias_attr
+        self._bias_attr = bias_attr if bias_attr else ParamAttr()
 
     def parameters(self):
-        if self._bias_attr:
-            return [self._w, self._b]
-        else:
-            return [self._w]
+        return [self._w, self._b]
 
     def _build_once(self, input):
         input_shape = input.shape
@@ -264,10 +261,11 @@ class FC(layers.Layer):
         # add bias
         size = list(out.shape[1:])
         if not self._built:
-            self._b = self._layer.create_parameter(
+            self._b = self._helper.create_parameter(
                 attr=self._bias_attr, shape=size, dtype=out.dtype, is_bias=True)
-        bias_out = self.create_variable_for_type_inference(dtype=out.dtype)
-        self.append_op(
+        bias_out = self._helper.create_variable_for_type_inference(
+            dtype=out.dtype)
+        self._helper.append_op(
             type='elementwise_add',
             inputs={'X': [out],
                     'Y': [self._b]},
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index ea9953f5814..e0fd44ae31e 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -405,8 +405,7 @@ class LayerHelper(object):
         """
         size = list(input_var.shape[dim_start:dim_end])
         bias_attr = self.bias_attr
-        if not bias_attr:
-            return input_var
+        assert bias_attr is not None
 
         b = self.create_parameter(
             attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index c38906ce6ad..410c75026bb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -121,21 +121,21 @@ class TestImperativeMnist(unittest.TestCase):
             img = np.ones([2, 1], np.float32)
             noise = np.ones([2, 2], np.float32)
             exe.run(startup)
-            d_loss_val = exe.run(discriminate_p,
-                                 feed={'img': img,
-                                       'noise': noise},
-                                 fetch_list=[d_loss])[0]
-            g_loss_val = exe.run(generate_p,
-                                 feed={'noise': noise},
-                                 fetch_list=[g_loss])[0]
+            static_d_loss = exe.run(discriminate_p,
+                                    feed={'img': img,
+                                          'noise': noise},
+                                    fetch_list=[d_loss])[0]
+            static_g_loss = exe.run(generate_p,
+                                    feed={'noise': noise},
+                                    fetch_list=[g_loss])[0]
+
+            # generate_p contains all parameters needed.
             for param in generate_p.global_block().all_parameters():
                 static_params[param.name] = np.array(
                     scope.find_var(param.name).get_tensor())
                 sys.stderr.write(
                     'static_param_loss: %s: %s\n' %
                     (param.name, np.sum(static_params[param.name])))
-            sys.stderr.write('d_loss %s, g_loss: %s\n' %
-                             (d_loss_val, g_loss_val))
 
         dy_params = dict()
         with fluid.imperative.guard():
@@ -181,8 +181,14 @@ class TestImperativeMnist(unittest.TestCase):
                 dy_params[p.name] = p._numpy()
                 sys.stderr.write('dy_param_loss: %s: %s\n' %
                                  (p.name, np.sum(dy_params[p.name])))
-            sys.stderr.write('dy_d_loss: %s, dy_g_loss: %s\n' %
-                             (d_loss._numpy(), g_loss._numpy()))
+
+            dy_g_loss = g_loss._numpy()
+            dy_d_loss = d_loss._numpy()
+
+        self.assertEqual(dy_g_loss, static_g_loss)
+        self.assertEqual(dy_d_loss, static_d_loss)
+        for k, v in six.iteritems(dy_params):
+            self.assertTrue(np.allclose(v, static_params[k]))
 
 
 if __name__ == '__main__':
-- 
GitLab


From 179363a15c41175e2174e9cf031006a24a3efc75 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 10:46:47 +0800
Subject: [PATCH 09/73] polish code: remove debug output and redundant optimizer in test_imperative_gan

test=develop
---
 python/paddle/fluid/tests/unittests/test_imperative_gan.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 410c75026bb..e0507e0b938 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -171,16 +171,11 @@ class TestImperativeMnist(unittest.TestCase):
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
             g_loss._backward()
-            sgd = SGDOptimizer(learning_rate=1e-3)
             sgd.minimize(g_loss)
             for p in discriminator.parameters():
                 dy_params[p.name] = p._numpy()
-                sys.stderr.write('dy_param_loss: %s: %s\n' %
-                                 (p.name, np.sum(dy_params[p.name])))
             for p in generator.parameters():
                 dy_params[p.name] = p._numpy()
-                sys.stderr.write('dy_param_loss: %s: %s\n' %
-                                 (p.name, np.sum(dy_params[p.name])))
 
             dy_g_loss = g_loss._numpy()
             dy_d_loss = d_loss._numpy()
-- 
GitLab


From e395f2c6a337edc7c413645eab0bbb76c6c408db Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 17:25:15 +0800
Subject: [PATCH 10/73] polish code: encapsulate VarBase fields and rename _clear to _clear_gradient

test=develop
---
 paddle/fluid/imperative/layer.cc              |  8 ++--
 paddle/fluid/imperative/layer.h               | 46 ++++++++++++-------
 paddle/fluid/imperative/tracer.cc             | 22 ++++-----
 paddle/fluid/pybind/pybind.cc                 |  6 +--
 python/paddle/fluid/framework.py              |  4 +-
 python/paddle/fluid/imperative/layers.py      |  4 ++
 python/paddle/fluid/imperative/nn.py          |  1 +
 .../tests/unittests/test_imperative_gan.py    |  9 +---
 8 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 426644ca918..b7df4b8886d 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -57,15 +57,15 @@ class Autograd {
   Autograd() {}
 
   void RunBackward(VarBase* var) {
-    if (var->stop_gradient_) {
+    if (var->IsStopGradient()) {
       return;
     }
     VLOG(3) << "start autograd";
 
     std::deque<OpBase*> ready;
-    ready.push_back(var->pre_op_);
+    ready.push_back(var->PreOp());
 
-    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
+    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->PreOp());
 
     while (!ready.empty()) {
       OpBase* ready_op = ready.front();
@@ -77,7 +77,7 @@ class Autograd {
         const std::vector<VarBase*>& ingrads = it.second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->stop_gradient_) {
+          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
             continue;
           }
           OpBase* pre_op = ready_op->pre_ops_[it.first][i];
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 2289da09070..0b1077c640e 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -100,20 +100,20 @@ class VarBase {
 
   // Owns `var` and `grad`
   VarBase(framework::Variable* var, VarBase* grad)
-      : pre_op_(nullptr),
-        pre_op_out_idx_(-1),
-        var_desc_(nullptr),
+      : var_desc_(nullptr),
         var_(var),
         grads_(grad),
-        stop_gradient_(false) {}
+        stop_gradient_(false),
+        pre_op_(nullptr),
+        pre_op_out_idx_(-1) {}
 
   explicit VarBase(bool stop_gradient)
-      : pre_op_(nullptr),
-        pre_op_out_idx_(-1),
-        var_desc_(nullptr),
+      : var_desc_(nullptr),
         var_(new framework::Variable()),
         grads_(stop_gradient ? nullptr : new VarBase(true)),
-        stop_gradient_(stop_gradient) {}
+        stop_gradient_(stop_gradient),
+        pre_op_(nullptr),
+        pre_op_out_idx_(-1) {}
 
   virtual ~VarBase() {
     if (var_) {
@@ -125,15 +125,27 @@ class VarBase {
     }
   }
 
-  void Clear() {
+  OpBase* PreOp() const { return pre_op_; }
+  int PreOpOutIdx() const { return pre_op_out_idx_; }
+
+  void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; }
+  bool IsStopGradient() const { return stop_gradient_; }
+
+  void RunBackward();
+
+  void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name,
+                  int pre_op_out_idx, bool stop_gradient) {
+    pre_op_ = pre_op;
+    pre_op_out_name_ = pre_op_out_name;
+    pre_op_out_idx_ = pre_op_out_idx;
+    stop_gradient_ = stop_gradient;
+  }
+
+  void ClearGradient() {
     delete grads_;
     grads_ = new VarBase(true);
-    pre_op_ = nullptr;
-    pre_op_out_name_ = "";
   }
 
-  void RunBackward();
-
   framework::LoDTensor& GradValue();
 
   inline std::string GradName() const {
@@ -143,16 +155,16 @@ class VarBase {
     return string::Sprintf("%s@IGrad", var_desc_->Name());
   }
 
-  OpBase* pre_op_;
-  std::string pre_op_out_name_;
-  int pre_op_out_idx_;
-
   framework::VarDesc* var_desc_;
 
   framework::Variable* var_;
   VarBase* grads_;
 
+ private:
   bool stop_gradient_;
+  OpBase* pre_op_;
+  std::string pre_op_out_name_;
+  int pre_op_out_idx_;
 };
 
 /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 2878f5be883..843fee41f38 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -63,9 +63,9 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
 
       invars.push_back(inp->var_);
       vars[inp->var_desc_->Name()] = inp;
-      if (inp->pre_op_) {
-        op->pre_ops_[it.first].push_back(inp->pre_op_);
-        op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_);
+      if (inp->PreOp()) {
+        op->pre_ops_[it.first].push_back(inp->PreOp());
+        op->pre_ops_out_idx_[it.first].push_back(inp->PreOpOutIdx());
       } else {
         op->pre_ops_[it.first].push_back(nullptr);
       }
@@ -89,10 +89,7 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
       } else {
         LOG(ERROR) << "tracer doesn't support yet";
       }
-      out->stop_gradient_ = stop_gradient;
-      out->pre_op_ = op;
-      out->pre_op_out_name_ = it.first;
-      out->pre_op_out_idx_ = i;
+      out->TrackPreOp(op, it.first, i, stop_gradient);
 
       VLOG(3) << "output vname " << out->var_desc_->Name() << " "
               << out->var_->IsInitialized();
@@ -167,9 +164,9 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   op->input_vars_[PyLayer::kFwdInp] = inputs;
   op->output_vars_[PyLayer::kFwdOut] = PyLayer::Apply(op->forward_id_, inputs);
   for (VarBase* inp : inputs) {
-    if (inp->pre_op_) {
-      op->pre_ops_[PyLayer::kFwdInp].push_back(inp->pre_op_);
-      op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->pre_op_out_idx_);
+    if (inp->PreOp()) {
+      op->pre_ops_[PyLayer::kFwdInp].push_back(inp->PreOp());
+      op->pre_ops_out_idx_[PyLayer::kFwdInp].push_back(inp->PreOpOutIdx());
     } else {
       op->pre_ops_[PyLayer::kFwdInp].push_back(nullptr);
     }
@@ -178,10 +175,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
   auto& outputs = op->output_vars_[PyLayer::kFwdOut];
   for (size_t i = 0; i < outputs.size(); ++i) {
     VarBase* out = outputs[i];
-    out->stop_gradient_ = stop_gradient;
-    out->pre_op_ = op;
-    out->pre_op_out_name_ = PyLayer::kFwdOut;
-    out->pre_op_out_idx_ = i;
+    out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient);
   }
   if (!stop_gradient) {
     auto& grad_input_vars =
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index efe70a075d1..96fa428ee36 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -133,7 +133,7 @@ PYBIND11_MODULE(core, m) {
            [](imperative::VarBase &self) { self.RunBackward(); })
       .def("_grad_name", &imperative::VarBase::GradName)
       .def("_grad_value", &imperative::VarBase::GradValue)
-      .def("_clear", &imperative::VarBase::Clear)
+      .def("_clear_gradient", &imperative::VarBase::ClearGradient)
       .def("_grad_ivar",
            [](const imperative::VarBase &self) { return self.grads_; },
            py::return_value_policy::reference)
@@ -148,9 +148,9 @@ PYBIND11_MODULE(core, m) {
           py::return_value_policy::reference)
       .def_property(
           "stop_gradient",
-          [](const imperative::VarBase &self) { return self.stop_gradient_; },
+          [](const imperative::VarBase &self) { return self.IsStopGradient(); },
           [](imperative::VarBase &self, bool stop_gradient) {
-            self.stop_gradient_ = stop_gradient;
+            self.SetStopGradient(stop_gradient);
           });
 
   py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index e737b9bc61a..eedfd4a60f4 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -388,8 +388,8 @@ class Variable(object):
     def _gradient(self):
         return np.array(self._ivar._grad_value())
 
-    def _clear(self):
-        self._ivar._clear()
+    def _clear_gradient(self):
+        self._ivar._clear_gradient()
 
     def __str__(self):
         return self.to_string(True)
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index ed67dda6375..6cd0c297552 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -33,6 +33,10 @@ class Layer(core.Layer):
     def parameters(self):
         return []
 
+    def clear_gradients(self):
+        for p in self.parameters():
+            p._clear()
+
     def _build_once(self, inputs):
         pass
 
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 95b59487661..79986070c23 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -48,6 +48,7 @@ class Conv2D(layers.Layer):
         assert param_attr is not False, "param_attr should not be False here."
         super(Conv2D, self).__init__(name=name, dtype=dtype)
 
+        # TODO(minqiyang): Move this to the top.
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
             type(self).__name__,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index e0507e0b938..4fe286f85ec 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -133,9 +133,6 @@ class TestImperativeMnist(unittest.TestCase):
             for param in generate_p.global_block().all_parameters():
                 static_params[param.name] = np.array(
                     scope.find_var(param.name).get_tensor())
-                sys.stderr.write(
-                    'static_param_loss: %s: %s\n' %
-                    (param.name, np.sum(static_params[param.name])))
 
         dy_params = dict()
         with fluid.imperative.guard():
@@ -160,10 +157,8 @@ class TestImperativeMnist(unittest.TestCase):
             d_loss = d_loss_real + d_loss_fake
             d_loss._backward()
             sgd.minimize(d_loss)
-            for p in discriminator.parameters():
-                p._clear()
-            for p in generator.parameters():
-                p._clear()
+            discriminator.clear_gradients()
+            generator.clear_gradients()
 
             d_fake = discriminator(
                 generator(to_variable(np.ones([2, 2], np.float32))))
-- 
GitLab


From bf180577ba508d368fd8f200230eaf92b1567c59 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 17:33:02 +0800
Subject: [PATCH 11/73] fix Layer.clear_gradients to call _clear_gradient

test=develop
---
 python/paddle/fluid/imperative/layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 6cd0c297552..f457f56203e 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -35,7 +35,7 @@ class Layer(core.Layer):
 
     def clear_gradients(self):
         for p in self.parameters():
-            p._clear()
+            p._clear_gradient()
 
     def _build_once(self, inputs):
         pass
-- 
GitLab


From d1ac56757974a2e0710a4b96071184734d97345e Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Wed, 16 Jan 2019 21:11:55 +0800
Subject: [PATCH 12/73] fix: restore early return when bias_attr is unset in LayerHelper

test=develop
---
 python/paddle/fluid/layer_helper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index e0fd44ae31e..ea9953f5814 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -405,7 +405,8 @@ class LayerHelper(object):
         """
         size = list(input_var.shape[dim_start:dim_end])
         bias_attr = self.bias_attr
-        assert bias_attr is not None
+        if not bias_attr:
+            return input_var
 
         b = self.create_parameter(
             attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-- 
GitLab


From 001827c270c36cd108687fd0180ed48754d3bec6 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 11:57:25 +0800
Subject: [PATCH 13/73] test_analyzer_mm_dnn runs in serial

test=develop
---
 paddle/fluid/inference/tests/api/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 0f670658892..a694b8194b2 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -84,7 +84,7 @@ inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_te
 # MM DNN
 set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn")
 download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc)
+inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc SERIAL)
 
 # text_classification
 set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
-- 
GitLab


From 81da854903daef56723820a8f68ed5e95db47b60 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:32:33 +0800
Subject: [PATCH 14/73] remove legacy C++ code

---
 paddle/legacy/api/Arguments.cpp               |  174 -
 paddle/legacy/api/CMakeLists.txt              |  120 -
 paddle/legacy/api/ConfigParser.cpp            |  114 -
 paddle/legacy/api/Evaluator.cpp               |   44 -
 paddle/legacy/api/GradientMachine.cpp         |  196 -
 paddle/legacy/api/Internal.h                  |   28 -
 paddle/legacy/api/Matrix.cpp                  |  317 --
 paddle/legacy/api/Paddle.i                    |  202 -
 paddle/legacy/api/PaddleAPI.h                 | 1054 ----
 paddle/legacy/api/PaddleAPIPrivate.h          |   97 -
 paddle/legacy/api/Parameter.cpp               |   68 -
 paddle/legacy/api/ParameterOptimizer.cpp      |  124 -
 paddle/legacy/api/ParameterUpdater.cpp        |   99 -
 paddle/legacy/api/SequenceGenerator.cpp       |  242 -
 paddle/legacy/api/Trainer.cpp                 |  175 -
 paddle/legacy/api/Util.cpp                    |   60 -
 paddle/legacy/api/Vector.cpp                  |  304 --
 paddle/legacy/api/__init__.py                 |   13 -
 paddle/legacy/api/numpy.i                     | 3161 -----------
 paddle/legacy/api/test/.gitignore             |    2 -
 paddle/legacy/api/test/CMakeLists.txt         |   11 -
 paddle/legacy/api/test/testArguments.py       |   54 -
 paddle/legacy/api/test/testGradientMachine.py |  116 -
 paddle/legacy/api/test/testMatrix.py          |  120 -
 paddle/legacy/api/test/testTrain.py           |  116 -
 paddle/legacy/api/test/testTrainConfig.py     |   25 -
 paddle/legacy/api/test/testTrainer.py         |   63 -
 paddle/legacy/api/test/testVector.py          |  153 -
 paddle/legacy/api/test/util.py                |   59 -
 paddle/legacy/capi/Arguments.cpp              |  140 -
 paddle/legacy/capi/CMakeLists.txt             |  118 -
 paddle/legacy/capi/Main.cpp                   |   53 -
 paddle/legacy/capi/Matrix.cpp                 |  171 -
 paddle/legacy/capi/Vector.cpp                 |   69 -
 paddle/legacy/capi/arguments.h                |  171 -
 paddle/legacy/capi/capi.h                     |   32 -
 paddle/legacy/capi/capi_private.h             |   82 -
 paddle/legacy/capi/config.h.in                |   13 -
 paddle/legacy/capi/error.cpp                  |   32 -
 paddle/legacy/capi/error.h                    |   45 -
 paddle/legacy/capi/examples/.gitignore        |    2 -
 paddle/legacy/capi/examples/README.md         |    3 -
 .../capi/examples/model_inference/README.md   |   42 -
 .../examples/model_inference/common/common.h  |   42 -
 .../model_inference/dense/CMakeLists.txt      |    6 -
 .../model_inference/dense/convert_protobin.sh |    2 -
 .../examples/model_inference/dense/main.c     |  116 -
 .../model_inference/dense/merge_v2_model.py   |   22 -
 .../model_inference/dense/mnist_v2.py         |  131 -
 .../model_inference/dense/trainer_config.py   |   13 -
 .../model_inference/multi_thread/.gitignore   |   73 -
 .../multi_thread/CMakeLists.txt               |   29 -
 .../multi_thread/convert_protobin.sh          |    1 -
 .../model_inference/multi_thread/main.c       |  112 -
 .../model_inference/multi_thread/main_gpu.c   |  127 -
 .../multi_thread/trainer_config.py            |   13 -
 .../model_inference/sequence/.gitignore       |   73 -
 .../model_inference/sequence/CMakeLists.txt   |    6 -
 .../sequence/convert_protobin.sh              |    1 -
 .../examples/model_inference/sequence/main.c  |   84 -
 .../sequence/trainer_config.py                |   27 -
 .../model_inference/sparse_binary/.gitignore  |   73 -
 .../sparse_binary/CMakeLists.txt              |    7 -
 .../sparse_binary/convert_protobin.sh         |    1 -
 .../model_inference/sparse_binary/main.c      |   87 -
 .../sparse_binary/trainer_config.py           |   13 -
 paddle/legacy/capi/gradient_machine.cpp       |  180 -
 paddle/legacy/capi/gradient_machine.h         |  127 -
 paddle/legacy/capi/main.h                     |   40 -
 paddle/legacy/capi/matrix.h                   |  146 -
 paddle/legacy/capi/paddle_capi.map            |    6 -
 paddle/legacy/capi/tests/.gitignore           |    2 -
 paddle/legacy/capi/tests/CMakeLists.txt       |   15 -
 paddle/legacy/capi/tests/test_Arguments.cpp   |  129 -
 .../capi/tests/test_GradientMachine.cpp       |  117 -
 paddle/legacy/capi/tests/test_Matrix.cpp      |   93 -
 paddle/legacy/capi/tests/test_Vector.cpp      |   32 -
 .../legacy/capi/tests/test_predict_network.py |   27 -
 paddle/legacy/capi/vector.h                   |   89 -
 paddle/legacy/cuda/CMakeLists.txt             |   89 -
 .../cuda/include/hl_activation_functions.h    |   60 -
 paddle/legacy/cuda/include/hl_aggregate.h     |  106 -
 paddle/legacy/cuda/include/hl_avx_functions.h |   32 -
 paddle/legacy/cuda/include/hl_base.h          |  250 -
 paddle/legacy/cuda/include/hl_batch_norm.h    |   48 -
 .../legacy/cuda/include/hl_batch_transpose.h  |   36 -
 paddle/legacy/cuda/include/hl_cnn.h           |  417 --
 paddle/legacy/cuda/include/hl_cpu_gru.cuh     |  477 --
 paddle/legacy/cuda/include/hl_cpu_lstm.cuh    |  372 --
 .../cuda/include/hl_cpu_matrix_kernel.cuh     |  196 -
 .../include/hl_cpu_matrix_kernel_detail.cuh   |  310 --
 paddle/legacy/cuda/include/hl_cpu_scalar.cuh  |   50 -
 .../legacy/cuda/include/hl_cpu_simd_neon.cuh  |   73 -
 .../legacy/cuda/include/hl_cpu_simd_sse.cuh   |   94 -
 paddle/legacy/cuda/include/hl_cuda.h          |  345 --
 paddle/legacy/cuda/include/hl_cuda.ph         |  112 -
 paddle/legacy/cuda/include/hl_cuda_cublas.h   |  172 -
 paddle/legacy/cuda/include/hl_cuda_cudnn.h    |  516 --
 paddle/legacy/cuda/include/hl_cuda_cudnn.ph   |   80 -
 .../cuda/include/hl_device_functions.cuh      |   71 -
 paddle/legacy/cuda/include/hl_functions.h     |   57 -
 paddle/legacy/cuda/include/hl_gpu.h           |   44 -
 .../legacy/cuda/include/hl_gpu_functions.cuh  |   68 -
 paddle/legacy/cuda/include/hl_gpu_gru.cuh     |  393 --
 paddle/legacy/cuda/include/hl_gpu_lstm.cuh    |  300 --
 .../cuda/include/hl_gpu_matrix_kernel.cuh     |  629 ---
 paddle/legacy/cuda/include/hl_gru_ops.cuh     |  205 -
 paddle/legacy/cuda/include/hl_lstm.h          |  130 -
 paddle/legacy/cuda/include/hl_lstm_ops.cuh    |  213 -
 paddle/legacy/cuda/include/hl_matrix.h        |  311 --
 .../legacy/cuda/include/hl_matrix_apply.cuh   |  423 --
 paddle/legacy/cuda/include/hl_matrix_base.cuh |  164 -
 .../cuda/include/hl_matrix_base_detail.cuh    |  153 -
 paddle/legacy/cuda/include/hl_matrix_ops.cuh  |  253 -
 paddle/legacy/cuda/include/hl_matrix_type.cuh |   51 -
 .../cuda/include/hl_perturbation_util.cuh     |   51 -
 .../cuda/include/hl_recurrent_apply.cuh       |  192 -
 paddle/legacy/cuda/include/hl_sequence.h      |  168 -
 paddle/legacy/cuda/include/hl_sparse.h        |  523 --
 paddle/legacy/cuda/include/hl_sparse.ph       |   85 -
 paddle/legacy/cuda/include/hl_table_apply.h   |   81 -
 paddle/legacy/cuda/include/hl_tensor_ops.h    |  536 --
 paddle/legacy/cuda/include/hl_thread.ph       |   84 -
 paddle/legacy/cuda/include/hl_time.h          |   29 -
 paddle/legacy/cuda/include/hl_top_k.h         |   87 -
 paddle/legacy/cuda/include/hl_warpctc_wrap.h  |   94 -
 .../cuda/include/stub/hl_aggregate_stub.h     |   36 -
 paddle/legacy/cuda/include/stub/hl_cnn_stub.h |  247 -
 .../cuda/include/stub/hl_cuda_cublas_stub.h   |   53 -
 .../cuda/include/stub/hl_cuda_cudnn_stub.h    |  201 -
 .../legacy/cuda/include/stub/hl_cuda_stub.h   |   97 -
 .../legacy/cuda/include/stub/hl_lstm_stub.h   |   67 -
 .../legacy/cuda/include/stub/hl_matrix_stub.h |  138 -
 .../cuda/include/stub/hl_sequence_stub.h      |   66 -
 .../legacy/cuda/include/stub/hl_sparse_stub.h |  185 -
 paddle/legacy/cuda/src/avx_mathfun.h          |  735 ---
 paddle/legacy/cuda/src/hl_avx_functions.cc    |   69 -
 paddle/legacy/cuda/src/hl_batch_norm.cu       |   66 -
 paddle/legacy/cuda/src/hl_batch_transpose.cu  |   59 -
 paddle/legacy/cuda/src/hl_cpu_functions.cc    |   44 -
 paddle/legacy/cuda/src/hl_cuda_aggregate.cu   |  293 -
 paddle/legacy/cuda/src/hl_cuda_cnn.cu         | 1106 ----
 paddle/legacy/cuda/src/hl_cuda_cublas.cc      |  400 --
 paddle/legacy/cuda/src/hl_cuda_cudnn.cc       | 1117 ----
 paddle/legacy/cuda/src/hl_cuda_device.cc      |  681 ---
 paddle/legacy/cuda/src/hl_cuda_lstm.cu        |  876 ---
 paddle/legacy/cuda/src/hl_cuda_matrix.cu      |  806 ---
 paddle/legacy/cuda/src/hl_cuda_sequence.cu    |  408 --
 paddle/legacy/cuda/src/hl_cuda_sparse.cu      | 1262 -----
 paddle/legacy/cuda/src/hl_cuda_sparse.cuh     | 1015 ----
 paddle/legacy/cuda/src/hl_math.cc             |   26 -
 .../legacy/cuda/src/hl_perturbation_util.cu   |  289 -
 paddle/legacy/cuda/src/hl_table_apply.cu      |  124 -
 paddle/legacy/cuda/src/hl_time.cc             |   27 -
 paddle/legacy/cuda/src/hl_top_k.cu            |  481 --
 paddle/legacy/cuda/src/hl_warpctc_wrap.cc     |  151 -
 paddle/legacy/function/BlockExpandOp.cpp      |  202 -
 paddle/legacy/function/BlockExpandOpTest.cpp  |  107 -
 paddle/legacy/function/BufferArg.cpp          |   52 -
 paddle/legacy/function/BufferArg.h            |  364 --
 paddle/legacy/function/BufferArgTest.cpp      |   38 -
 paddle/legacy/function/CMakeLists.txt         |   54 -
 .../legacy/function/ContextProjectionOp.cpp   |  412 --
 paddle/legacy/function/ContextProjectionOp.h  |   86 -
 .../legacy/function/ContextProjectionOpGpu.cu |  413 --
 .../function/ContextProjectionOpTest.cpp      |  114 -
 paddle/legacy/function/ConvOp.h               |  157 -
 paddle/legacy/function/ConvOpTest.h           |  275 -
 paddle/legacy/function/CosSimOp.cpp           |  240 -
 paddle/legacy/function/CosSimOp.h             |   61 -
 paddle/legacy/function/CosSimOpGpu.cu         |  248 -
 paddle/legacy/function/CosSimOpTest.cpp       |   64 -
 paddle/legacy/function/CropOp.cpp             |  177 -
 paddle/legacy/function/CropOp.h               |   51 -
 paddle/legacy/function/CropOpGpu.cu           |  150 -
 paddle/legacy/function/CropOpTest.cpp         |   49 -
 paddle/legacy/function/CrossMapNormalOp.cpp   |  344 --
 paddle/legacy/function/CrossMapNormalOp.h     |   81 -
 paddle/legacy/function/CrossMapNormalOpGpu.cu |  177 -
 .../legacy/function/CrossMapNormalOpTest.cpp  |   80 -
 paddle/legacy/function/DepthwiseConvOp.cpp    |  305 --
 paddle/legacy/function/DepthwiseConvOp.h      |  159 -
 paddle/legacy/function/DepthwiseConvOpGpu.cu  |  376 --
 .../legacy/function/DepthwiseConvOpTest.cpp   |   46 -
 paddle/legacy/function/EigenGemm.cpp          |  102 -
 paddle/legacy/function/EigenThreadDevice.h    |   73 -
 paddle/legacy/function/Function.cpp           |   45 -
 paddle/legacy/function/Function.h             |  214 -
 paddle/legacy/function/FunctionTest.cpp       |  166 -
 paddle/legacy/function/FunctionTest.h         |  410 --
 paddle/legacy/function/GemmConvOp.cpp         |  522 --
 paddle/legacy/function/GemmConvOpTest.cpp     |   50 -
 paddle/legacy/function/GemmFunctor.cpp        |   90 -
 paddle/legacy/function/GemmFunctor.h          |   65 -
 paddle/legacy/function/GruFunctor.h           |  159 -
 paddle/legacy/function/Im2Col.h               |  154 -
 paddle/legacy/function/Im2ColOp.cpp           |  245 -
 paddle/legacy/function/Im2ColOpGpu.cu         |  464 --
 paddle/legacy/function/Im2ColTest.cpp         |  223 -
 paddle/legacy/function/MulOp.cpp              |  347 --
 paddle/legacy/function/MulOp.h                |  102 -
 paddle/legacy/function/MulOpGpu.cu            |  130 -
 paddle/legacy/function/MulOpTest.cpp          |  212 -
 paddle/legacy/function/NaiveConvOp.cpp        |  141 -
 paddle/legacy/function/PadOp.cpp              |  215 -
 paddle/legacy/function/PadOp.h                |   73 -
 paddle/legacy/function/PadOpGpu.cu            |  132 -
 paddle/legacy/function/PadOpTest.cpp          |   49 -
 paddle/legacy/function/RowConvOp.cpp          |  225 -
 paddle/legacy/function/RowConvOp.h            |   56 -
 paddle/legacy/function/RowConvOpGpu.cu        |  373 --
 paddle/legacy/function/RowConvOpTest.cpp      |   62 -
 paddle/legacy/function/ScaleSubRegionOp.cpp   |  155 -
 paddle/legacy/function/ScaleSubRegionOp.h     |   55 -
 paddle/legacy/function/ScaleSubRegionOpGpu.cu |  116 -
 .../legacy/function/ScaleSubRegionOpTest.cpp  |   72 -
 paddle/legacy/function/SwitchOp.cpp           |  140 -
 paddle/legacy/function/SwitchOp.h             |   66 -
 paddle/legacy/function/SwitchOpGpu.cu         |   98 -
 paddle/legacy/function/SwitchOpTest.cpp       |   44 -
 paddle/legacy/function/TensorShape.h          |  107 -
 paddle/legacy/function/TensorShapeTest.cpp    |   53 -
 paddle/legacy/function/TensorType.h           |  149 -
 paddle/legacy/function/TensorTypeTest.cpp     |   64 -
 .../function/neon/NeonDepthwiseConv.cpp       |  120 -
 .../legacy/function/neon/NeonDepthwiseConv.h  |  627 ---
 .../neon/NeonDepthwiseConvTranspose.cpp       |  136 -
 paddle/legacy/function/neon/neon_util.h       |   43 -
 .../legacy/function/nnpack/NNPACKConvOp.cpp   |  247 -
 .../function/nnpack/NNPACKConvOpTest.cpp      |   30 -
 paddle/legacy/gserver/CMakeLists.txt          |  152 -
 .../activations/ActivationFunction.cpp        |  509 --
 .../gserver/activations/ActivationFunction.h  |   66 -
 .../gserver/activations/MKLDNNActivation.cpp  |  249 -
 .../gserver/activations/MKLDNNActivation.h    |  119 -
 .../gserver/dataproviders/DataProvider.cpp    |  410 --
 .../gserver/dataproviders/DataProvider.h      |  480 --
 .../gserver/dataproviders/DataProviderGroup.h |  153 -
 .../dataproviders/MultiDataProvider.cpp       |  122 -
 .../gserver/dataproviders/MultiDataProvider.h |   41 -
 .../gserver/dataproviders/ProtoReader.h       |  177 -
 .../gserver/dataproviders/PyDataProvider.cpp  |  498 --
 .../gserver/dataproviders/PyDataProvider.h    |  124 -
 .../gserver/dataproviders/PyDataProvider2.cpp | 1031 ----
 .../gserver/evaluators/CTCErrorEvaluator.cpp  |  320 --
 .../gserver/evaluators/ChunkEvaluator.cpp     |  296 -
 .../evaluators/DetectionMAPEvaluator.cpp      |  308 --
 .../legacy/gserver/evaluators/Evaluator.cpp   | 1361 -----
 paddle/legacy/gserver/evaluators/Evaluator.h  |  510 --
 .../gradientmachines/GradientMachine.cpp      |  104 -
 .../gradientmachines/GradientMachine.h        |  250 -
 .../gradientmachines/GradientMachineMode.cpp  |   20 -
 .../gradientmachines/GradientMachineMode.h    |  149 -
 .../gradientmachines/MultiGradientMachine.cpp |  898 ----
 .../gradientmachines/MultiGradientMachine.h   |  478 --
 .../gserver/gradientmachines/MultiNetwork.cpp |  185 -
 .../gserver/gradientmachines/MultiNetwork.h   |   64 -
 .../gradientmachines/NeuralNetwork.cpp        |  548 --
 .../gserver/gradientmachines/NeuralNetwork.h  |  179 -
 .../ParallelNeuralNetwork.cpp                 |  214 -
 .../gradientmachines/ParallelNeuralNetwork.h  |  113 -
 .../RecurrentGradientMachine.cpp              | 1501 ------
 .../RecurrentGradientMachine.h                |  580 --
 paddle/legacy/gserver/layers/AddtoLayer.cpp   |   79 -
 paddle/legacy/gserver/layers/AddtoLayer.h     |   63 -
 paddle/legacy/gserver/layers/AgentLayer.cpp   |  281 -
 paddle/legacy/gserver/layers/AgentLayer.h     |  177 -
 paddle/legacy/gserver/layers/AverageLayer.cpp |   67 -
 paddle/legacy/gserver/layers/AverageLayer.h   |   54 -
 .../gserver/layers/BatchNormBaseLayer.cpp     |   80 -
 .../gserver/layers/BatchNormBaseLayer.h       |  101 -
 .../layers/BatchNormalizationLayer.cpp        |  266 -
 .../gserver/layers/BatchNormalizationLayer.h  |   70 -
 .../gserver/layers/BilinearInterpLayer.cpp    |  107 -
 .../gserver/layers/BilinearInterpLayer.h      |   47 -
 .../gserver/layers/BlockExpandLayer.cpp       |  121 -
 .../legacy/gserver/layers/BlockExpandLayer.h  |   68 -
 .../gserver/layers/CRFDecodingLayer.cpp       |   69 -
 .../legacy/gserver/layers/CRFDecodingLayer.h  |   44 -
 paddle/legacy/gserver/layers/CRFLayer.cpp     |  117 -
 paddle/legacy/gserver/layers/CRFLayer.h       |   46 -
 paddle/legacy/gserver/layers/CTCLayer.cpp     |  121 -
 paddle/legacy/gserver/layers/CTCLayer.h       |   41 -
 paddle/legacy/gserver/layers/ClipLayer.cpp    |   79 -
 .../gserver/layers/ConcatenateLayer.cpp       |  208 -
 .../gserver/layers/ContextProjection.cpp      |  185 -
 .../legacy/gserver/layers/ContextProjection.h |   78 -
 paddle/legacy/gserver/layers/Conv3DLayer.cpp  |  253 -
 paddle/legacy/gserver/layers/Conv3DLayer.h    |   51 -
 .../legacy/gserver/layers/ConvBaseLayer.cpp   |  120 -
 paddle/legacy/gserver/layers/ConvBaseLayer.h  |  107 -
 .../gserver/layers/ConvBaseOperator.cpp       |  151 -
 .../legacy/gserver/layers/ConvBaseOperator.h  |  112 -
 .../gserver/layers/ConvBaseProjection.cpp     |  199 -
 .../gserver/layers/ConvBaseProjection.h       |  111 -
 paddle/legacy/gserver/layers/ConvOperator.cpp |  128 -
 paddle/legacy/gserver/layers/ConvOperator.h   |   44 -
 .../legacy/gserver/layers/ConvProjection.cpp  |  123 -
 paddle/legacy/gserver/layers/ConvProjection.h |   43 -
 .../legacy/gserver/layers/ConvShiftLayer.cpp  |  108 -
 .../gserver/layers/ConvTransOperator.cpp      |  125 -
 .../legacy/gserver/layers/ConvTransOperator.h |   44 -
 .../gserver/layers/ConvTransProjection.cpp    |  123 -
 .../gserver/layers/ConvTransProjection.h      |   43 -
 .../gserver/layers/ConvexCombinationLayer.cpp |  155 -
 paddle/legacy/gserver/layers/CosSimLayer.cpp  |   93 -
 paddle/legacy/gserver/layers/CosSimLayer.h    |   48 -
 .../gserver/layers/CosSimVecMatLayer.cpp      |  182 -
 paddle/legacy/gserver/layers/CostLayer.cpp    |  748 ---
 paddle/legacy/gserver/layers/CostLayer.h      |  374 --
 paddle/legacy/gserver/layers/CropLayer.cpp    |  146 -
 paddle/legacy/gserver/layers/CropLayer.h      |   52 -
 .../gserver/layers/CrossChannelNormLayer.cpp  |  137 -
 .../gserver/layers/CrossEntropyOverBeam.cpp   |  393 --
 .../gserver/layers/CrossEntropyOverBeam.h     |  135 -
 .../gserver/layers/CudnnBatchNormLayer.cpp    |  180 -
 .../gserver/layers/CudnnBatchNormLayer.h      |   68 -
 .../gserver/layers/CudnnConvBaseLayer.cpp     |  135 -
 .../gserver/layers/CudnnConvBaseLayer.h       |   53 -
 .../legacy/gserver/layers/CudnnPoolLayer.cpp  |  139 -
 paddle/legacy/gserver/layers/CudnnPoolLayer.h |   61 -
 paddle/legacy/gserver/layers/DataLayer.cpp    |   67 -
 paddle/legacy/gserver/layers/DataLayer.h      |   70 -
 .../legacy/gserver/layers/DataNormLayer.cpp   |  140 -
 paddle/legacy/gserver/layers/DataNormLayer.h  |   62 -
 .../legacy/gserver/layers/DeConv3DLayer.cpp   |  220 -
 paddle/legacy/gserver/layers/DeConv3DLayer.h  |   52 -
 .../gserver/layers/DetectionOutputLayer.cpp   |  160 -
 .../gserver/layers/DetectionOutputLayer.h     |   77 -
 .../legacy/gserver/layers/DetectionUtil.cpp   |  576 --
 paddle/legacy/gserver/layers/DetectionUtil.h  |  307 --
 .../legacy/gserver/layers/DotMulOperator.cpp  |   62 -
 .../gserver/layers/DotMulProjection.cpp       |   68 -
 paddle/legacy/gserver/layers/DotProdLayer.cpp |   97 -
 .../legacy/gserver/layers/EosIdCheckLayer.cpp |   50 -
 .../legacy/gserver/layers/ExpandConvLayer.cpp |  248 -
 .../legacy/gserver/layers/ExpandConvLayer.h   |   51 -
 paddle/legacy/gserver/layers/ExpandLayer.cpp  |  133 -
 paddle/legacy/gserver/layers/ExpandLayer.h    |   63 -
 .../layers/FactorizationMachineLayer.cpp      |  158 -
 .../layers/FactorizationMachineLayer.h        |   80 -
 .../gserver/layers/FeatureMapExpandLayer.cpp  |  155 -
 .../gserver/layers/FullMatrixProjection.cpp   |   60 -
 .../gserver/layers/FullMatrixProjection.h     |   42 -
 .../gserver/layers/FullyConnectedLayer.cpp    |  150 -
 .../gserver/layers/FullyConnectedLayer.h      |   49 -
 .../gserver/layers/GatedRecurrentLayer.cpp    |  414 --
 .../gserver/layers/GatedRecurrentLayer.h      |  100 -
 .../legacy/gserver/layers/GetOutputLayer.cpp  |   41 -
 paddle/legacy/gserver/layers/GruCompute.cpp   |   54 -
 paddle/legacy/gserver/layers/GruCompute.cu    |   47 -
 paddle/legacy/gserver/layers/GruCompute.h     |   41 -
 paddle/legacy/gserver/layers/GruStepLayer.cpp |  177 -
 .../layers/HierarchicalSigmoidLayer.cpp       |  240 -
 .../gserver/layers/HierarchicalSigmoidLayer.h |   94 -
 .../gserver/layers/IdentityProjection.cpp     |  103 -
 .../gserver/layers/InterpolationLayer.cpp     |  130 -
 .../gserver/layers/KmaxSeqScoreLayer.cpp      |  126 -
 .../legacy/gserver/layers/L2DistanceLayer.cpp |   91 -
 .../legacy/gserver/layers/L2DistanceLayer.h   |   52 -
 paddle/legacy/gserver/layers/Layer.cpp        |  410 --
 paddle/legacy/gserver/layers/Layer.h          |  512 --
 .../legacy/gserver/layers/LinearChainCRF.cpp  |  218 -
 paddle/legacy/gserver/layers/LinearChainCRF.h |   97 -
 .../legacy/gserver/layers/LinearChainCTC.cpp  |  265 -
 paddle/legacy/gserver/layers/LinearChainCTC.h |   50 -
 paddle/legacy/gserver/layers/LstmCompute.cpp  |   93 -
 paddle/legacy/gserver/layers/LstmCompute.cu   |   73 -
 paddle/legacy/gserver/layers/LstmCompute.h    |   66 -
 paddle/legacy/gserver/layers/LstmLayer.cpp    |  805 ---
 paddle/legacy/gserver/layers/LstmLayer.h      |  221 -
 .../legacy/gserver/layers/LstmStepLayer.cpp   |  194 -
 paddle/legacy/gserver/layers/MDLstmLayer.cpp  |  769 ---
 .../gserver/layers/MKLDNNAddtoLayer.cpp       |  219 -
 .../legacy/gserver/layers/MKLDNNAddtoLayer.h  |   87 -
 paddle/legacy/gserver/layers/MKLDNNBase.h     |   97 -
 .../gserver/layers/MKLDNNBatchNormLayer.cpp   |  306 --
 .../gserver/layers/MKLDNNBatchNormLayer.h     |  125 -
 .../gserver/layers/MKLDNNConcatLayer.cpp      |  186 -
 .../legacy/gserver/layers/MKLDNNConcatLayer.h |   96 -
 .../legacy/gserver/layers/MKLDNNConvLayer.cpp |  388 --
 .../legacy/gserver/layers/MKLDNNConvLayer.h   |  161 -
 .../legacy/gserver/layers/MKLDNNFcLayer.cpp   |  262 -
 paddle/legacy/gserver/layers/MKLDNNFcLayer.h  |  107 -
 .../legacy/gserver/layers/MKLDNNLRNLayer.cpp  |  163 -
 paddle/legacy/gserver/layers/MKLDNNLRNLayer.h |   78 -
 paddle/legacy/gserver/layers/MKLDNNLayer.cpp  |  304 --
 paddle/legacy/gserver/layers/MKLDNNLayer.h    |  477 --
 .../legacy/gserver/layers/MKLDNNPoolLayer.cpp |  195 -
 .../legacy/gserver/layers/MKLDNNPoolLayer.h   |  110 -
 .../layers/MKLPackedRecurrentLayer.cpp        |  132 -
 .../gserver/layers/MKLPackedRecurrentLayer.h  |   58 -
 .../legacy/gserver/layers/MKLPackedWeight.h   |   86 -
 paddle/legacy/gserver/layers/MaxIdLayer.cpp   |   62 -
 paddle/legacy/gserver/layers/MaxLayer.cpp     |   65 -
 paddle/legacy/gserver/layers/MaxLayer.h       |   58 -
 paddle/legacy/gserver/layers/MaxOutLayer.cpp  |   87 -
 paddle/legacy/gserver/layers/MaxOutLayer.h    |   55 -
 .../gserver/layers/MaxPoolWithMaskLayer.cpp   |  109 -
 .../gserver/layers/MaxPoolWithMaskLayer.h     |   40 -
 paddle/legacy/gserver/layers/MixedLayer.cpp   |  176 -
 paddle/legacy/gserver/layers/MixedLayer.h     |   63 -
 .../gserver/layers/MultiBoxLossLayer.cpp      |  376 --
 .../legacy/gserver/layers/MultiBoxLossLayer.h |  103 -
 .../gserver/layers/MultinomialSampler.cpp     |   86 -
 .../gserver/layers/MultinomialSampler.h       |   81 -
 .../legacy/gserver/layers/MultiplexLayer.cpp  |  180 -
 paddle/legacy/gserver/layers/NCELayer.cpp     |  323 --
 paddle/legacy/gserver/layers/NormLayer.cpp    |   59 -
 paddle/legacy/gserver/layers/NormLayer.h      |   99 -
 .../gserver/layers/NormProjectionLayer.cpp    |  101 -
 .../gserver/layers/NormProjectionLayer.h      |   47 -
 paddle/legacy/gserver/layers/Operator.cpp     |   25 -
 paddle/legacy/gserver/layers/Operator.h       |   96 -
 .../legacy/gserver/layers/OuterProdLayer.cpp  |  141 -
 paddle/legacy/gserver/layers/PadLayer.cpp     |  106 -
 paddle/legacy/gserver/layers/PadLayer.h       |   47 -
 .../gserver/layers/ParameterReluLayer.cpp     |   69 -
 .../gserver/layers/ParameterReluLayer.h       |   65 -
 paddle/legacy/gserver/layers/Pool3DLayer.cpp  |  178 -
 paddle/legacy/gserver/layers/Pool3DLayer.h    |   49 -
 paddle/legacy/gserver/layers/PoolLayer.cpp    |   70 -
 paddle/legacy/gserver/layers/PoolLayer.h      |   55 -
 .../legacy/gserver/layers/PoolProjection.cpp  |  175 -
 paddle/legacy/gserver/layers/PoolProjection.h |   68 -
 .../gserver/layers/PoolProjectionLayer.cpp    |   65 -
 .../gserver/layers/PoolProjectionLayer.h      |   46 -
 paddle/legacy/gserver/layers/PowerLayer.cpp   |  120 -
 paddle/legacy/gserver/layers/PrintLayer.cpp   |   68 -
 paddle/legacy/gserver/layers/PriorBox.cpp     |  159 -
 paddle/legacy/gserver/layers/Projection.cpp   |   32 -
 paddle/legacy/gserver/layers/Projection.h     |  140 -
 paddle/legacy/gserver/layers/ROIPoolLayer.cpp |  233 -
 paddle/legacy/gserver/layers/ROIPoolLayer.h   |   56 -
 .../legacy/gserver/layers/RecurrentLayer.cpp  |  301 --
 paddle/legacy/gserver/layers/RecurrentLayer.h |  130 -
 .../gserver/layers/RecurrentLayerGroup.cpp    |   95 -
 paddle/legacy/gserver/layers/ResizeLayer.cpp  |   79 -
 paddle/legacy/gserver/layers/RotateLayer.cpp  |  102 -
 paddle/legacy/gserver/layers/RotateLayer.h    |   51 -
 paddle/legacy/gserver/layers/RowConvLayer.cpp |  106 -
 paddle/legacy/gserver/layers/RowConvLayer.h   |   44 -
 .../legacy/gserver/layers/RowL2NormLayer.cpp  |   98 -
 .../legacy/gserver/layers/SamplingIdLayer.cpp |   91 -
 .../legacy/gserver/layers/ScaleShiftLayer.cpp |  107 -
 .../gserver/layers/ScaleSubRegionLayer.cpp    |   78 -
 .../gserver/layers/ScaleSubRegionLayer.h      |   52 -
 paddle/legacy/gserver/layers/ScalingLayer.cpp |  106 -
 .../gserver/layers/ScalingProjection.cpp      |   57 -
 .../layers/SelectiveFullyConnectedLayer.cpp   |  336 --
 .../layers/SelectiveFullyConnectedLayer.h     |  103 -
 .../gserver/layers/SequenceConcatLayer.cpp    |  189 -
 .../layers/SequenceLastInstanceLayer.cpp      |  118 -
 .../gserver/layers/SequencePoolLayer.cpp      |   93 -
 .../legacy/gserver/layers/SequencePoolLayer.h |   64 -
 .../gserver/layers/SequenceReshapeLayer.cpp   |  157 -
 .../gserver/layers/SequenceSliceLayer.cpp     |  224 -
 .../legacy/gserver/layers/SequenceToBatch.cpp |  256 -
 .../legacy/gserver/layers/SequenceToBatch.h   |  107 -
 .../legacy/gserver/layers/SliceProjection.cpp |   96 -
 .../gserver/layers/SlopeInterceptLayer.cpp    |   94 -
 .../layers/SpatialPyramidPoolLayer.cpp        |  134 -
 .../gserver/layers/SpatialPyramidPoolLayer.h  |   59 -
 .../gserver/layers/SubNestedSequenceLayer.cpp |  187 -
 .../gserver/layers/SubSequenceLayer.cpp       |  226 -
 .../gserver/layers/SumToOneNormLayer.cpp      |  120 -
 .../gserver/layers/SwitchOrderLayer.cpp       |  109 -
 .../legacy/gserver/layers/SwitchOrderLayer.h  |   47 -
 .../legacy/gserver/layers/TableProjection.cpp |   51 -
 .../legacy/gserver/layers/TableProjection.h   |   50 -
 paddle/legacy/gserver/layers/TensorLayer.cpp  |  145 -
 paddle/legacy/gserver/layers/TensorLayer.h    |   55 -
 paddle/legacy/gserver/layers/TransLayer.cpp   |   69 -
 paddle/legacy/gserver/layers/TransLayer.h     |   41 -
 .../layers/TransposedFullMatrixProjection.cpp |   80 -
 .../legacy/gserver/layers/UpsampleLayer.cpp   |  108 -
 paddle/legacy/gserver/layers/UpsampleLayer.h  |   53 -
 .../legacy/gserver/layers/ValidationLayer.cpp |  171 -
 .../legacy/gserver/layers/ValidationLayer.h   |  104 -
 paddle/legacy/gserver/layers/WarpCTCLayer.cpp |  222 -
 paddle/legacy/gserver/layers/WarpCTCLayer.h   |   66 -
 paddle/legacy/gserver/tests/.gitignore        |    1 -
 paddle/legacy/gserver/tests/CMakeLists.txt    |  103 -
 paddle/legacy/gserver/tests/LayerGradUtil.cpp |  854 ---
 paddle/legacy/gserver/tests/LayerGradUtil.h   |  329 --
 paddle/legacy/gserver/tests/MKLDNNTester.cpp  |  580 --
 paddle/legacy/gserver/tests/MKLDNNTester.h    |  143 -
 .../legacy/gserver/tests/Sequence/dummy.list  |    1 -
 .../tests/Sequence/tour_dict_phrase.dict      |  158 -
 .../gserver/tests/Sequence/tour_train_wdseg   |   10 -
 .../tests/Sequence/tour_train_wdseg.nest      |   14 -
 .../legacy/gserver/tests/Sequence/train.list  |    1 -
 .../gserver/tests/Sequence/train.list.nest    |    1 -
 paddle/legacy/gserver/tests/__init__.py       |   13 -
 .../legacy/gserver/tests/concat_dotmul_a.conf |   31 -
 .../legacy/gserver/tests/concat_dotmul_b.conf |   29 -
 .../gserver/tests/concat_fullmatrix_a.conf    |   35 -
 .../gserver/tests/concat_fullmatrix_b.conf    |   29 -
 .../legacy/gserver/tests/concat_slice_a.conf  |   41 -
 .../legacy/gserver/tests/concat_slice_b.conf  |   41 -
 .../legacy/gserver/tests/concat_table_a.conf  |   32 -
 .../legacy/gserver/tests/concat_table_b.conf  |   29 -
 paddle/legacy/gserver/tests/img_conv_a.conf   |   40 -
 paddle/legacy/gserver/tests/img_conv_b.conf   |   32 -
 paddle/legacy/gserver/tests/img_conv_c.conf   |   43 -
 paddle/legacy/gserver/tests/img_conv_cudnn.py |   31 -
 .../legacy/gserver/tests/img_conv_exconv.py   |   31 -
 paddle/legacy/gserver/tests/img_pool_a.conf   |   44 -
 paddle/legacy/gserver/tests/img_pool_b.conf   |   44 -
 .../gserver/tests/mkldnn_branch_net.conf      |  142 -
 .../gserver/tests/mkldnn_simple_net.conf      |   66 -
 paddle/legacy/gserver/tests/pyDataProvider.py |  146 -
 .../tests/pyDataProvider/pyDataProviderList   |    0
 .../gserver/tests/pyDataProvider/trainer.conf |   75 -
 .../legacy/gserver/tests/rnn_data_provider.py |  115 -
 paddle/legacy/gserver/tests/sequenceGen.py    |   70 -
 .../gserver/tests/sequence_layer_group.conf   |   62 -
 .../legacy/gserver/tests/sequence_lstm.conf   |   64 -
 .../tests/sequence_nest_layer_group.conf      |   83 -
 .../gserver/tests/sequence_nest_rnn.conf      |   74 -
 .../tests/sequence_nest_rnn_multi_input.conf  |   76 -
 ...ence_nest_rnn_multi_unequalength_inputs.py |   96 -
 .../gserver/tests/sequence_recurrent.py       |   55 -
 .../gserver/tests/sequence_recurrent_group.py |   68 -
 paddle/legacy/gserver/tests/sequence_rnn.conf |   57 -
 .../tests/sequence_rnn_matched_inputs.py      |   84 -
 .../tests/sequence_rnn_mixed_inputs.py        |   78 -
 .../tests/sequence_rnn_multi_input.conf       |   58 -
 .../sequence_rnn_multi_unequalength_inputs.py |   76 -
 .../gserver/tests/test_ActivationGrad.cpp     |   98 -
 .../legacy/gserver/tests/test_BatchNorm.cpp   |  195 -
 .../gserver/tests/test_CRFLayerGrad.cpp       |  173 -
 .../gserver/tests/test_CompareSparse.cpp      |  228 -
 .../gserver/tests/test_CompareTwoNets.cpp     |  210 -
 .../legacy/gserver/tests/test_ConvTrans.cpp   |  244 -
 .../legacy/gserver/tests/test_ConvUnify.cpp   |  315 --
 .../tests/test_CrossEntropyOverBeamGrad.cpp   |  352 --
 .../gserver/tests/test_DetectionOutput.cpp    |  194 -
 .../legacy/gserver/tests/test_Evaluator.cpp   |  267 -
 paddle/legacy/gserver/tests/test_Expand.cpp   |  127 -
 .../gserver/tests/test_KmaxSeqScore.cpp       |  164 -
 .../legacy/gserver/tests/test_LayerGrad.cpp   | 2532 ---------
 .../gserver/tests/test_LinearChainCRF.cpp     |   67 -
 paddle/legacy/gserver/tests/test_MKLDNN.cpp   |  448 --
 .../tests/test_MaxPoolingWithMaskOutput.cpp   |  117 -
 .../gserver/tests/test_MultinomialSampler.cpp |  147 -
 .../gserver/tests/test_NetworkCompare.cpp     |  294 -
 paddle/legacy/gserver/tests/test_PriorBox.cpp |  212 -
 .../gserver/tests/test_PyDataProvider.cpp     |  177 -
 .../gserver/tests/test_PyDataProvider2.cpp    |  409 --
 .../gserver/tests/test_PyDataProvider2.py     |  125 -
 .../tests/test_RecurrentGradientMachine.cpp   |  180 -
 .../gserver/tests/test_RecurrentLayer.cpp     |  571 --
 .../gserver/tests/test_SelectiveFCLayer.cpp   |  471 --
 .../gserver/tests/test_SeqSliceLayerGrad.cpp  |  224 -
 paddle/legacy/gserver/tests/test_Upsample.cpp |  153 -
 .../gserver/tests/test_WarpCTCLayer.cpp       |  244 -
 paddle/legacy/math/Allocator.h                |  137 -
 paddle/legacy/math/BaseMatrix.cu              | 1953 -------
 paddle/legacy/math/BaseMatrix.h               | 1095 ----
 paddle/legacy/math/CMakeLists.txt             |   57 -
 paddle/legacy/math/CpuSparseMatrix.cpp        |  787 ---
 paddle/legacy/math/CpuSparseMatrix.h          |  377 --
 paddle/legacy/math/ExecViaCpu.h               |  195 -
 paddle/legacy/math/MKLDNNMatrix.cpp           |  158 -
 paddle/legacy/math/MKLDNNMatrix.h             |  256 -
 paddle/legacy/math/MathFunctions.cpp          |  348 --
 paddle/legacy/math/MathFunctions.h            |  129 -
 paddle/legacy/math/MathUtils.cpp              |   97 -
 paddle/legacy/math/MathUtils.h                |   70 -
 paddle/legacy/math/Matrix.cpp                 | 4787 -----------------
 paddle/legacy/math/Matrix.h                   | 2189 --------
 paddle/legacy/math/MatrixBitCode.cpp          |  291 -
 paddle/legacy/math/MemoryHandle.cpp           |   56 -
 paddle/legacy/math/MemoryHandle.h             |   65 -
 paddle/legacy/math/NEONFunctions.cpp          |   95 -
 paddle/legacy/math/NEONFunctions.h            |   24 -
 paddle/legacy/math/PoolAllocator.cpp          |   83 -
 paddle/legacy/math/PoolAllocator.h            |   61 -
 paddle/legacy/math/RowBuffer.h                |  139 -
 paddle/legacy/math/SIMDFunctions.cpp          |  397 --
 paddle/legacy/math/SIMDFunctions.h            |  179 -
 paddle/legacy/math/SparseMatrix.cpp           |  864 ---
 paddle/legacy/math/SparseMatrix.h             |  286 -
 paddle/legacy/math/SparseRowMatrix.cpp        |  282 -
 paddle/legacy/math/SparseRowMatrix.h          |  341 --
 paddle/legacy/math/Storage.cpp                |  101 -
 paddle/legacy/math/Storage.h                  |   52 -
 paddle/legacy/math/TensorApply.h              |  211 -
 paddle/legacy/math/TensorAssign.h             |  158 -
 paddle/legacy/math/TensorEvaluate.h           |  112 -
 paddle/legacy/math/TensorExpression.h         |  446 --
 paddle/legacy/math/TrainingAlgorithmOp.cu     |  356 --
 paddle/legacy/math/TrainingAlgorithmOp.h      |  122 -
 paddle/legacy/math/Vector.cpp                 | 1091 ----
 paddle/legacy/math/Vector.h                   |  726 ---
 paddle/legacy/math/tests/CMakeLists.txt       |   35 -
 .../legacy/math/tests/OriginalOptimizerApi.h  |  201 -
 paddle/legacy/math/tests/PerfUtils.h          |   46 -
 paddle/legacy/math/tests/TensorCheck.h        |  216 -
 paddle/legacy/math/tests/TestUtils.h          |  294 -
 paddle/legacy/math/tests/test_Allocator.cpp   |  122 -
 paddle/legacy/math/tests/test_BaseMatrix.cpp  |  247 -
 .../legacy/math/tests/test_CpuGpuVector.cpp   |   80 -
 paddle/legacy/math/tests/test_ExecViaCpu.cpp  |  116 -
 paddle/legacy/math/tests/test_FPException.cpp |   93 -
 paddle/legacy/math/tests/test_GpuProfiler.cpp |  165 -
 paddle/legacy/math/tests/test_Matrix.cpp      |  273 -
 paddle/legacy/math/tests/test_RowBuffer.cpp   |   65 -
 .../legacy/math/tests/test_SIMDFunctions.cpp  |  171 -
 .../legacy/math/tests/test_SparseMatrix.cpp   |  565 --
 paddle/legacy/math/tests/test_Tensor.cu       | 1162 ----
 .../math/tests/test_TrainingAlgorithm.cpp     |  461 --
 .../legacy/math/tests/test_batchTranspose.cpp |   55 -
 paddle/legacy/math/tests/test_lazyAssign.cu   |  147 -
 .../legacy/math/tests/test_matrixCompare.cpp  | 1698 ------
 paddle/legacy/math/tests/test_matrixUtil.h    |  233 -
 .../legacy/math/tests/test_perturbation.cpp   |  318 --
 .../math/tests/test_sparseMatrixCompare.cpp   |  174 -
 paddle/legacy/optimizer/CMakeLists.txt        |   16 -
 paddle/legacy/optimizer/adadelta_optimizer.cc |   69 -
 paddle/legacy/optimizer/adadelta_optimizer.h  |   53 -
 paddle/legacy/optimizer/adagrad_optimizer.cc  |   57 -
 paddle/legacy/optimizer/adagrad_optimizer.h   |   46 -
 paddle/legacy/optimizer/adam_optimizer.cc     |   63 -
 paddle/legacy/optimizer/adam_optimizer.h      |   55 -
 paddle/legacy/optimizer/lr_policy.h           |   82 -
 paddle/legacy/optimizer/optimizer.cc          |  106 -
 paddle/legacy/optimizer/optimizer.h           |  107 -
 .../legacy/optimizer/parameter_optimizer.cc   |   92 -
 paddle/legacy/optimizer/parameter_optimizer.h |   56 -
 .../optimizer/parameter_optimizer_test.cc     |  127 -
 paddle/legacy/optimizer/serialization.h       |   49 -
 paddle/legacy/optimizer/serialization_test.cc |   46 -
 paddle/legacy/optimizer/sgd_optimizer.cc      |   65 -
 paddle/legacy/optimizer/sgd_optimizer.h       |   50 -
 paddle/legacy/optimizer/tensor.h              |   68 -
 paddle/legacy/parameter/Argument.cpp          |  707 ---
 paddle/legacy/parameter/Argument.h            |  349 --
 paddle/legacy/parameter/AverageOptimizer.cpp  |  206 -
 paddle/legacy/parameter/AverageOptimizer.h    |  145 -
 paddle/legacy/parameter/CMakeLists.txt        |   11 -
 .../legacy/parameter/FirstOrderOptimizer.cpp  |  330 --
 paddle/legacy/parameter/FirstOrderOptimizer.h |  381 --
 .../parameter/LearningRateScheduler.cpp       |  173 -
 .../legacy/parameter/LearningRateScheduler.h  |   37 -
 .../legacy/parameter/OptimizerFunctions.cpp   |   50 -
 paddle/legacy/parameter/OptimizerFunctions.h  |   43 -
 .../parameter/OptimizerWithRegularizer.cpp    |  193 -
 .../parameter/OptimizerWithRegularizer.h      |  157 -
 paddle/legacy/parameter/Parameter.cpp         |  425 --
 paddle/legacy/parameter/Parameter.h           |  380 --
 .../legacy/parameter/ParameterOptimizer.cpp   |   63 -
 paddle/legacy/parameter/ParameterOptimizer.h  |  211 -
 .../parameter/ParameterUpdateFunctions.cpp    |  300 --
 .../parameter/ParameterUpdateFunctions.h      |   56 -
 .../legacy/parameter/ParameterUpdaterBase.cpp |   41 -
 .../legacy/parameter/ParameterUpdaterBase.h   |  182 -
 .../legacy/parameter/ParameterUpdaterHook.cpp |  155 -
 .../legacy/parameter/ParameterUpdaterHook.h   |   63 -
 paddle/legacy/parameter/Regularizer.cpp       |   54 -
 paddle/legacy/parameter/Regularizer.h         |  115 -
 paddle/legacy/parameter/ThreadLocalBuffer.cpp |   35 -
 paddle/legacy/parameter/ThreadLocalBuffer.h   |   22 -
 paddle/legacy/parameter/Weight.cpp            |   84 -
 paddle/legacy/parameter/Weight.h              |   48 -
 paddle/legacy/parameter/tests/CMakeLists.txt  |    2 -
 .../legacy/parameter/tests/test_argument.cpp  |   57 -
 paddle/legacy/parameter/tests/test_common.cpp |  174 -
 paddle/legacy/pserver/BaseClient.cpp          |   80 -
 paddle/legacy/pserver/BaseClient.h            |  311 --
 paddle/legacy/pserver/CMakeLists.txt          |   56 -
 paddle/legacy/pserver/LightNetwork.cpp        |  459 --
 paddle/legacy/pserver/LightNetwork.h          |  185 -
 paddle/legacy/pserver/ParameterClient2.cpp    |  781 ---
 paddle/legacy/pserver/ParameterClient2.h      |  602 ---
 paddle/legacy/pserver/ParameterServer2.cpp    | 1401 -----
 paddle/legacy/pserver/ParameterServer2.h      |  696 ---
 .../legacy/pserver/ParameterServer2Main.cpp   |   29 -
 .../pserver/ParameterServerController.cpp     |  102 -
 .../pserver/ParameterServerController.h       |   74 -
 paddle/legacy/pserver/ProtoServer.cpp         |   74 -
 paddle/legacy/pserver/ProtoServer.h           |  267 -
 paddle/legacy/pserver/RDMANetwork.h           |  158 -
 paddle/legacy/pserver/SocketChannel.cpp       |  235 -
 paddle/legacy/pserver/SocketChannel.h         |  153 -
 .../pserver/SparseParameterDistribution.cpp   |  123 -
 .../pserver/SparseParameterDistribution.h     |   52 -
 paddle/legacy/pserver/test/.gitignore         |    5 -
 paddle/legacy/pserver/test/CMakeLists.txt     |   28 -
 paddle/legacy/pserver/test/SocketTest.cpp     |  256 -
 .../pserver/test/test_ParameterServer2.cpp    |  624 ---
 .../legacy/pserver/test/test_ProtoServer.cpp  |  169 -
 .../legacy/pserver/test/test_ProtoServer.sh   |   33 -
 paddle/legacy/trainer/CMakeLists.txt          |   73 -
 paddle/legacy/trainer/MergeModel.cpp          |   64 -
 .../trainer/NewRemoteParameterUpdater.cpp     |  150 -
 .../trainer/NewRemoteParameterUpdater.h       |  121 -
 paddle/legacy/trainer/ParamUtil.cpp           |  163 -
 paddle/legacy/trainer/ParamUtil.h             |  125 -
 paddle/legacy/trainer/ParameterUpdater.cpp    |  152 -
 paddle/legacy/trainer/ParameterUpdater.h      |  265 -
 .../legacy/trainer/RemoteParameterUpdater.cpp |  843 ---
 .../legacy/trainer/RemoteParameterUpdater.h   |  416 --
 paddle/legacy/trainer/Tester.cpp              |  380 --
 paddle/legacy/trainer/Tester.h                |  149 -
 paddle/legacy/trainer/TesterConfig.h          |  138 -
 .../legacy/trainer/ThreadParameterUpdater.cpp |  309 --
 .../legacy/trainer/ThreadParameterUpdater.h   |   85 -
 paddle/legacy/trainer/Trainer.cpp             |  653 ---
 paddle/legacy/trainer/Trainer.h               |  204 -
 paddle/legacy/trainer/TrainerBenchmark.cpp    |   71 -
 paddle/legacy/trainer/TrainerConfigHelper.cpp |  199 -
 paddle/legacy/trainer/TrainerConfigHelper.h   |  205 -
 paddle/legacy/trainer/TrainerInternal.cpp     |  303 --
 paddle/legacy/trainer/TrainerInternal.h       |  139 -
 .../legacy/trainer/TrainerInternalConfig.cpp  |   49 -
 paddle/legacy/trainer/TrainerInternalConfig.h |  233 -
 paddle/legacy/trainer/TrainerMain.cpp         |   65 -
 paddle/legacy/trainer/tests/.gitignore        |    3 -
 paddle/legacy/trainer/tests/CMakeLists.txt    |   41 -
 paddle/legacy/trainer/tests/__init__.py       |   13 -
 .../trainer/tests/config_parser_test.py       |   23 -
 .../legacy/trainer/tests/fake_file_list.list  |    1 -
 paddle/legacy/trainer/tests/picojson.h        | 1103 ----
 .../test_pydata_provider_wrapper.data         |    2 -
 .../test_pydata_provider_wrapper.list         |    1 -
 .../tests/rnn_gen_test_model_dir/r1.test.beam |   60 -
 .../tests/rnn_gen_test_model_dir/r1.test.nest |   16 -
 .../rnn_gen_test_model_dir/r1.test.nobeam     |   16 -
 .../rnn_gen_test_model_dir/t1/transtable      |  Bin 116 -> 0 bytes
 .../tests/rnn_gen_test_model_dir/t1/wordvec   |  Bin 116 -> 0 bytes
 paddle/legacy/trainer/tests/sample_data.txt   |   10 -
 .../legacy/trainer/tests/sample_filelist.txt  |    1 -
 .../trainer/tests/sample_trainer_config.conf  |   87 -
 .../tests/sample_trainer_config_hsigmoid.conf |   53 -
 .../tests/sample_trainer_config_parallel.conf |   86 -
 .../tests/sample_trainer_nest_rnn_gen.conf    |   73 -
 .../trainer/tests/sample_trainer_rnn_gen.conf |   66 -
 .../tests/simple_sparse_neural_network.py     |   37 -
 .../tests/simple_sparse_neural_network_dp.py  |   35 -
 .../legacy/trainer/tests/testPyDataWrapper.py |  130 -
 paddle/legacy/trainer/tests/test_Compare.cpp  |  158 -
 .../tests/test_PyDataProviderWrapper.cpp      |  220 -
 paddle/legacy/trainer/tests/test_Trainer.cpp  |  107 -
 .../trainer/tests/test_TrainerOnePass.cpp     |  318 --
 paddle/legacy/trainer/tests/test_config.conf  |   77 -
 paddle/legacy/trainer/tests/test_gen_dict.txt |    9 -
 .../test_recurrent_machine_generation.cpp     |  157 -
 paddle/legacy/utils/.gitignore                |    1 -
 paddle/legacy/utils/Any.h                     |   35 -
 paddle/legacy/utils/CMakeLists.txt            |   20 -
 paddle/legacy/utils/ClassRegistrar.h          |   81 -
 paddle/legacy/utils/Common.h                  |   35 -
 paddle/legacy/utils/CpuId.cpp                 |   66 -
 paddle/legacy/utils/CpuId.h                   |  136 -
 paddle/legacy/utils/CustomStackTrace.cpp      |   59 -
 paddle/legacy/utils/CustomStackTrace.h        |  193 -
 paddle/legacy/utils/DynamicLoader.cpp         |  170 -
 paddle/legacy/utils/DynamicLoader.h           |   68 -
 paddle/legacy/utils/Error.h                   |  145 -
 paddle/legacy/utils/Excepts.h                 |   28 -
 paddle/legacy/utils/Flags.cpp                 |   91 -
 paddle/legacy/utils/Flags.h                   |   44 -
 paddle/legacy/utils/GlobalConstants.cpp       |   23 -
 paddle/legacy/utils/GlobalConstants.h         |   97 -
 paddle/legacy/utils/Locks.h                   |  242 -
 paddle/legacy/utils/Logging.cpp               |   47 -
 paddle/legacy/utils/Logging.h                 |   46 -
 paddle/legacy/utils/PythonUtil.cpp            |  215 -
 paddle/legacy/utils/PythonUtil.h              |  381 --
 paddle/legacy/utils/Queue.h                   |  255 -
 paddle/legacy/utils/Stat.cpp                  |  165 -
 paddle/legacy/utils/Stat.h                    |  302 --
 paddle/legacy/utils/StringUtil.cpp            |   57 -
 paddle/legacy/utils/StringUtil.h              |  105 -
 paddle/legacy/utils/Thread.h                  |  615 ---
 paddle/legacy/utils/ThreadLocal.cpp           |   61 -
 paddle/legacy/utils/ThreadLocal.h             |  231 -
 paddle/legacy/utils/Util.cpp                  |  409 --
 paddle/legacy/utils/Util.h                    |  597 --
 paddle/legacy/utils/Version.cpp               |   60 -
 paddle/legacy/utils/Version.h                 |  131 -
 paddle/legacy/utils/arch/linux/Locks.cpp      |  149 -
 paddle/legacy/utils/arch/osx/Excepts.cpp      |   57 -
 paddle/legacy/utils/arch/osx/Locks.cpp        |  105 -
 paddle/legacy/utils/enable_virtualenv.py      |   26 -
 paddle/legacy/utils/tests/CMakeLists.txt      |   18 -
 .../utils/tests/test_CustomStackTrace.cpp     |   92 -
 .../tests/test_CustomStackTracePrint.cpp      |   30 -
 .../utils/tests/test_CustomStackTracePrint.sh |   15 -
 paddle/legacy/utils/tests/test_Error.cpp      |   34 -
 paddle/legacy/utils/tests/test_SIMDFlags.cpp  |   48 -
 paddle/legacy/utils/tests/test_SpinLock.cpp   |   55 -
 .../legacy/utils/tests/test_StringUtils.cpp   |   23 -
 paddle/legacy/utils/tests/test_Thread.cpp     |   81 -
 .../legacy/utils/tests/test_ThreadBarrier.cpp |   66 -
 797 files changed, 151956 deletions(-)
 delete mode 100644 paddle/legacy/api/Arguments.cpp
 delete mode 100644 paddle/legacy/api/CMakeLists.txt
 delete mode 100644 paddle/legacy/api/ConfigParser.cpp
 delete mode 100644 paddle/legacy/api/Evaluator.cpp
 delete mode 100644 paddle/legacy/api/GradientMachine.cpp
 delete mode 100644 paddle/legacy/api/Internal.h
 delete mode 100644 paddle/legacy/api/Matrix.cpp
 delete mode 100644 paddle/legacy/api/Paddle.i
 delete mode 100644 paddle/legacy/api/PaddleAPI.h
 delete mode 100644 paddle/legacy/api/PaddleAPIPrivate.h
 delete mode 100644 paddle/legacy/api/Parameter.cpp
 delete mode 100644 paddle/legacy/api/ParameterOptimizer.cpp
 delete mode 100644 paddle/legacy/api/ParameterUpdater.cpp
 delete mode 100644 paddle/legacy/api/SequenceGenerator.cpp
 delete mode 100644 paddle/legacy/api/Trainer.cpp
 delete mode 100644 paddle/legacy/api/Util.cpp
 delete mode 100644 paddle/legacy/api/Vector.cpp
 delete mode 100644 paddle/legacy/api/__init__.py
 delete mode 100644 paddle/legacy/api/numpy.i
 delete mode 100644 paddle/legacy/api/test/.gitignore
 delete mode 100644 paddle/legacy/api/test/CMakeLists.txt
 delete mode 100644 paddle/legacy/api/test/testArguments.py
 delete mode 100644 paddle/legacy/api/test/testGradientMachine.py
 delete mode 100644 paddle/legacy/api/test/testMatrix.py
 delete mode 100644 paddle/legacy/api/test/testTrain.py
 delete mode 100644 paddle/legacy/api/test/testTrainConfig.py
 delete mode 100644 paddle/legacy/api/test/testTrainer.py
 delete mode 100644 paddle/legacy/api/test/testVector.py
 delete mode 100644 paddle/legacy/api/test/util.py
 delete mode 100644 paddle/legacy/capi/Arguments.cpp
 delete mode 100644 paddle/legacy/capi/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/Main.cpp
 delete mode 100644 paddle/legacy/capi/Matrix.cpp
 delete mode 100644 paddle/legacy/capi/Vector.cpp
 delete mode 100644 paddle/legacy/capi/arguments.h
 delete mode 100644 paddle/legacy/capi/capi.h
 delete mode 100644 paddle/legacy/capi/capi_private.h
 delete mode 100644 paddle/legacy/capi/config.h.in
 delete mode 100644 paddle/legacy/capi/error.cpp
 delete mode 100644 paddle/legacy/capi/error.h
 delete mode 100644 paddle/legacy/capi/examples/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/README.md
 delete mode 100644 paddle/legacy/capi/examples/model_inference/README.md
 delete mode 100644 paddle/legacy/capi/examples/model_inference/common/common.h
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
 delete mode 100755 paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
 delete mode 100755 paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
 delete mode 100755 paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
 delete mode 100644 paddle/legacy/capi/gradient_machine.cpp
 delete mode 100644 paddle/legacy/capi/gradient_machine.h
 delete mode 100644 paddle/legacy/capi/main.h
 delete mode 100644 paddle/legacy/capi/matrix.h
 delete mode 100644 paddle/legacy/capi/paddle_capi.map
 delete mode 100644 paddle/legacy/capi/tests/.gitignore
 delete mode 100644 paddle/legacy/capi/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/tests/test_Arguments.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_GradientMachine.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_Matrix.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_Vector.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_predict_network.py
 delete mode 100644 paddle/legacy/capi/vector.h
 delete mode 100755 paddle/legacy/cuda/CMakeLists.txt
 delete mode 100644 paddle/legacy/cuda/include/hl_activation_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_aggregate.h
 delete mode 100644 paddle/legacy/cuda/include/hl_avx_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_base.h
 delete mode 100644 paddle/legacy/cuda/include/hl_batch_norm.h
 delete mode 100644 paddle/legacy/cuda/include/hl_batch_transpose.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cnn.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_gru.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_lstm.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_scalar.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cublas.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cudnn.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cudnn.ph
 delete mode 100755 paddle/legacy/cuda/include/hl_device_functions.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu.h
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_functions.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_gru.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_lstm.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gru_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_lstm.h
 delete mode 100644 paddle/legacy/cuda/include/hl_lstm_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix.h
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_apply.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_base.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_type.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_perturbation_util.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_recurrent_apply.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_sequence.h
 delete mode 100644 paddle/legacy/cuda/include/hl_sparse.h
 delete mode 100644 paddle/legacy/cuda/include/hl_sparse.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_table_apply.h
 delete mode 100644 paddle/legacy/cuda/include/hl_tensor_ops.h
 delete mode 100644 paddle/legacy/cuda/include/hl_thread.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_time.h
 delete mode 100644 paddle/legacy/cuda/include/hl_top_k.h
 delete mode 100644 paddle/legacy/cuda/include/hl_warpctc_wrap.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cnn_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_lstm_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_matrix_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_sequence_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_sparse_stub.h
 delete mode 100644 paddle/legacy/cuda/src/avx_mathfun.h
 delete mode 100644 paddle/legacy/cuda/src/hl_avx_functions.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_batch_norm.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_batch_transpose.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cpu_functions.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_aggregate.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cnn.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cublas.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cudnn.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_device.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_lstm.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_matrix.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sequence.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sparse.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sparse.cuh
 delete mode 100644 paddle/legacy/cuda/src/hl_math.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_perturbation_util.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_table_apply.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_time.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_top_k.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_warpctc_wrap.cc
 delete mode 100644 paddle/legacy/function/BlockExpandOp.cpp
 delete mode 100644 paddle/legacy/function/BlockExpandOpTest.cpp
 delete mode 100644 paddle/legacy/function/BufferArg.cpp
 delete mode 100644 paddle/legacy/function/BufferArg.h
 delete mode 100644 paddle/legacy/function/BufferArgTest.cpp
 delete mode 100644 paddle/legacy/function/CMakeLists.txt
 delete mode 100644 paddle/legacy/function/ContextProjectionOp.cpp
 delete mode 100644 paddle/legacy/function/ContextProjectionOp.h
 delete mode 100644 paddle/legacy/function/ContextProjectionOpGpu.cu
 delete mode 100644 paddle/legacy/function/ContextProjectionOpTest.cpp
 delete mode 100644 paddle/legacy/function/ConvOp.h
 delete mode 100644 paddle/legacy/function/ConvOpTest.h
 delete mode 100644 paddle/legacy/function/CosSimOp.cpp
 delete mode 100644 paddle/legacy/function/CosSimOp.h
 delete mode 100644 paddle/legacy/function/CosSimOpGpu.cu
 delete mode 100644 paddle/legacy/function/CosSimOpTest.cpp
 delete mode 100644 paddle/legacy/function/CropOp.cpp
 delete mode 100644 paddle/legacy/function/CropOp.h
 delete mode 100644 paddle/legacy/function/CropOpGpu.cu
 delete mode 100644 paddle/legacy/function/CropOpTest.cpp
 delete mode 100644 paddle/legacy/function/CrossMapNormalOp.cpp
 delete mode 100644 paddle/legacy/function/CrossMapNormalOp.h
 delete mode 100644 paddle/legacy/function/CrossMapNormalOpGpu.cu
 delete mode 100644 paddle/legacy/function/CrossMapNormalOpTest.cpp
 delete mode 100644 paddle/legacy/function/DepthwiseConvOp.cpp
 delete mode 100644 paddle/legacy/function/DepthwiseConvOp.h
 delete mode 100644 paddle/legacy/function/DepthwiseConvOpGpu.cu
 delete mode 100644 paddle/legacy/function/DepthwiseConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/EigenGemm.cpp
 delete mode 100644 paddle/legacy/function/EigenThreadDevice.h
 delete mode 100644 paddle/legacy/function/Function.cpp
 delete mode 100644 paddle/legacy/function/Function.h
 delete mode 100644 paddle/legacy/function/FunctionTest.cpp
 delete mode 100644 paddle/legacy/function/FunctionTest.h
 delete mode 100644 paddle/legacy/function/GemmConvOp.cpp
 delete mode 100644 paddle/legacy/function/GemmConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/GemmFunctor.cpp
 delete mode 100644 paddle/legacy/function/GemmFunctor.h
 delete mode 100644 paddle/legacy/function/GruFunctor.h
 delete mode 100644 paddle/legacy/function/Im2Col.h
 delete mode 100644 paddle/legacy/function/Im2ColOp.cpp
 delete mode 100644 paddle/legacy/function/Im2ColOpGpu.cu
 delete mode 100644 paddle/legacy/function/Im2ColTest.cpp
 delete mode 100644 paddle/legacy/function/MulOp.cpp
 delete mode 100644 paddle/legacy/function/MulOp.h
 delete mode 100644 paddle/legacy/function/MulOpGpu.cu
 delete mode 100644 paddle/legacy/function/MulOpTest.cpp
 delete mode 100644 paddle/legacy/function/NaiveConvOp.cpp
 delete mode 100644 paddle/legacy/function/PadOp.cpp
 delete mode 100644 paddle/legacy/function/PadOp.h
 delete mode 100644 paddle/legacy/function/PadOpGpu.cu
 delete mode 100644 paddle/legacy/function/PadOpTest.cpp
 delete mode 100644 paddle/legacy/function/RowConvOp.cpp
 delete mode 100644 paddle/legacy/function/RowConvOp.h
 delete mode 100644 paddle/legacy/function/RowConvOpGpu.cu
 delete mode 100644 paddle/legacy/function/RowConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOp.cpp
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOp.h
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOpGpu.cu
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOpTest.cpp
 delete mode 100644 paddle/legacy/function/SwitchOp.cpp
 delete mode 100644 paddle/legacy/function/SwitchOp.h
 delete mode 100644 paddle/legacy/function/SwitchOpGpu.cu
 delete mode 100644 paddle/legacy/function/SwitchOpTest.cpp
 delete mode 100644 paddle/legacy/function/TensorShape.h
 delete mode 100644 paddle/legacy/function/TensorShapeTest.cpp
 delete mode 100644 paddle/legacy/function/TensorType.h
 delete mode 100644 paddle/legacy/function/TensorTypeTest.cpp
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConv.cpp
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConv.h
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
 delete mode 100644 paddle/legacy/function/neon/neon_util.h
 delete mode 100644 paddle/legacy/function/nnpack/NNPACKConvOp.cpp
 delete mode 100644 paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
 delete mode 100644 paddle/legacy/gserver/CMakeLists.txt
 delete mode 100644 paddle/legacy/gserver/activations/ActivationFunction.cpp
 delete mode 100644 paddle/legacy/gserver/activations/ActivationFunction.h
 delete mode 100644 paddle/legacy/gserver/activations/MKLDNNActivation.cpp
 delete mode 100644 paddle/legacy/gserver/activations/MKLDNNActivation.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProviderGroup.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/MultiDataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/ProtoReader.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/Evaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/Evaluator.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachine.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
 delete mode 100644 paddle/legacy/gserver/layers/AddtoLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AddtoLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/AgentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AgentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/AverageLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AverageLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormalizationLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BilinearInterpLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BlockExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BlockExpandLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CRFDecodingLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CRFLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CRFLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CTCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CTCLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ClipLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConcatenateLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ContextProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ContextProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/Conv3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Conv3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvShiftLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CosSimLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CosSimLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CostLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CostLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CropLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CropLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DataLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DataLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DataNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DataNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DeConv3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DeConv3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DetectionOutputLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DetectionUtil.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DetectionUtil.h
 delete mode 100644 paddle/legacy/gserver/layers/DotMulOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DotMulProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DotProdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FactorizationMachineLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullMatrixProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullMatrixProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullyConnectedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GatedRecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/GetOutputLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.cu
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.h
 delete mode 100644 paddle/legacy/gserver/layers/GruStepLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/IdentityProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/InterpolationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/L2DistanceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/L2DistanceLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Layer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Layer.h
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCRF.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCRF.h
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCTC.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCTC.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.cu
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LstmLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmStepLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MDLstmLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBase.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNFcLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedWeight.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxIdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxOutLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxOutLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MixedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MixedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MultiBoxLossLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MultinomialSampler.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MultinomialSampler.h
 delete mode 100644 paddle/legacy/gserver/layers/MultiplexLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NCELayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/NormProjectionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormProjectionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Operator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Operator.h
 delete mode 100644 paddle/legacy/gserver/layers/OuterProdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PadLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PadLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ParameterReluLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ParameterReluLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Pool3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Pool3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjectionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PowerLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PrintLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PriorBox.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Projection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Projection.h
 delete mode 100644 paddle/legacy/gserver/layers/ROIPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ROIPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ResizeLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RotateLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RotateLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RowConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RowConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RowL2NormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SamplingIdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ScalingLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScalingProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequencePoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequencePoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceToBatch.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceToBatch.h
 delete mode 100644 paddle/legacy/gserver/layers/SliceProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SubSequenceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SwitchOrderLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TableProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TableProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/TensorLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TensorLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TransLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TransLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/UpsampleLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/UpsampleLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ValidationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ValidationLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/WarpCTCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/WarpCTCLayer.h
 delete mode 100644 paddle/legacy/gserver/tests/.gitignore
 delete mode 100644 paddle/legacy/gserver/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/gserver/tests/LayerGradUtil.cpp
 delete mode 100644 paddle/legacy/gserver/tests/LayerGradUtil.h
 delete mode 100644 paddle/legacy/gserver/tests/MKLDNNTester.cpp
 delete mode 100644 paddle/legacy/gserver/tests/MKLDNNTester.h
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/dummy.list
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/train.list
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/train.list.nest
 delete mode 100644 paddle/legacy/gserver/tests/__init__.py
 delete mode 100644 paddle/legacy/gserver/tests/concat_dotmul_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_dotmul_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_slice_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_slice_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_table_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_table_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_c.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_cudnn.py
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_exconv.py
 delete mode 100644 paddle/legacy/gserver/tests/img_pool_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_pool_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/mkldnn_branch_net.conf
 delete mode 100644 paddle/legacy/gserver/tests/mkldnn_simple_net.conf
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider.py
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
 delete mode 100644 paddle/legacy/gserver/tests/rnn_data_provider.py
 delete mode 100644 paddle/legacy/gserver/tests/sequenceGen.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_layer_group.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_lstm.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_recurrent.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_recurrent_group.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/test_ActivationGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_BatchNorm.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CompareSparse.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_ConvTrans.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_ConvUnify.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_DetectionOutput.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Evaluator.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Expand.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_LayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MKLDNN.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_NetworkCompare.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PriorBox.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider2.py
 delete mode 100644 paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Upsample.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
 delete mode 100644 paddle/legacy/math/Allocator.h
 delete mode 100644 paddle/legacy/math/BaseMatrix.cu
 delete mode 100644 paddle/legacy/math/BaseMatrix.h
 delete mode 100644 paddle/legacy/math/CMakeLists.txt
 delete mode 100644 paddle/legacy/math/CpuSparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/CpuSparseMatrix.h
 delete mode 100644 paddle/legacy/math/ExecViaCpu.h
 delete mode 100644 paddle/legacy/math/MKLDNNMatrix.cpp
 delete mode 100644 paddle/legacy/math/MKLDNNMatrix.h
 delete mode 100644 paddle/legacy/math/MathFunctions.cpp
 delete mode 100644 paddle/legacy/math/MathFunctions.h
 delete mode 100644 paddle/legacy/math/MathUtils.cpp
 delete mode 100644 paddle/legacy/math/MathUtils.h
 delete mode 100644 paddle/legacy/math/Matrix.cpp
 delete mode 100644 paddle/legacy/math/Matrix.h
 delete mode 100644 paddle/legacy/math/MatrixBitCode.cpp
 delete mode 100644 paddle/legacy/math/MemoryHandle.cpp
 delete mode 100644 paddle/legacy/math/MemoryHandle.h
 delete mode 100644 paddle/legacy/math/NEONFunctions.cpp
 delete mode 100644 paddle/legacy/math/NEONFunctions.h
 delete mode 100644 paddle/legacy/math/PoolAllocator.cpp
 delete mode 100644 paddle/legacy/math/PoolAllocator.h
 delete mode 100644 paddle/legacy/math/RowBuffer.h
 delete mode 100644 paddle/legacy/math/SIMDFunctions.cpp
 delete mode 100644 paddle/legacy/math/SIMDFunctions.h
 delete mode 100644 paddle/legacy/math/SparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/SparseMatrix.h
 delete mode 100644 paddle/legacy/math/SparseRowMatrix.cpp
 delete mode 100644 paddle/legacy/math/SparseRowMatrix.h
 delete mode 100644 paddle/legacy/math/Storage.cpp
 delete mode 100644 paddle/legacy/math/Storage.h
 delete mode 100644 paddle/legacy/math/TensorApply.h
 delete mode 100644 paddle/legacy/math/TensorAssign.h
 delete mode 100644 paddle/legacy/math/TensorEvaluate.h
 delete mode 100644 paddle/legacy/math/TensorExpression.h
 delete mode 100644 paddle/legacy/math/TrainingAlgorithmOp.cu
 delete mode 100644 paddle/legacy/math/TrainingAlgorithmOp.h
 delete mode 100644 paddle/legacy/math/Vector.cpp
 delete mode 100644 paddle/legacy/math/Vector.h
 delete mode 100644 paddle/legacy/math/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/math/tests/OriginalOptimizerApi.h
 delete mode 100644 paddle/legacy/math/tests/PerfUtils.h
 delete mode 100644 paddle/legacy/math/tests/TensorCheck.h
 delete mode 100644 paddle/legacy/math/tests/TestUtils.h
 delete mode 100644 paddle/legacy/math/tests/test_Allocator.cpp
 delete mode 100644 paddle/legacy/math/tests/test_BaseMatrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_CpuGpuVector.cpp
 delete mode 100644 paddle/legacy/math/tests/test_ExecViaCpu.cpp
 delete mode 100644 paddle/legacy/math/tests/test_FPException.cpp
 delete mode 100644 paddle/legacy/math/tests/test_GpuProfiler.cpp
 delete mode 100644 paddle/legacy/math/tests/test_Matrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_RowBuffer.cpp
 delete mode 100644 paddle/legacy/math/tests/test_SIMDFunctions.cpp
 delete mode 100644 paddle/legacy/math/tests/test_SparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_Tensor.cu
 delete mode 100644 paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
 delete mode 100644 paddle/legacy/math/tests/test_batchTranspose.cpp
 delete mode 100644 paddle/legacy/math/tests/test_lazyAssign.cu
 delete mode 100644 paddle/legacy/math/tests/test_matrixCompare.cpp
 delete mode 100644 paddle/legacy/math/tests/test_matrixUtil.h
 delete mode 100644 paddle/legacy/math/tests/test_perturbation.cpp
 delete mode 100644 paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
 delete mode 100644 paddle/legacy/optimizer/CMakeLists.txt
 delete mode 100644 paddle/legacy/optimizer/adadelta_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adadelta_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/adagrad_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adagrad_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/adam_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adam_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/lr_policy.h
 delete mode 100644 paddle/legacy/optimizer/optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/optimizer.h
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer_test.cc
 delete mode 100644 paddle/legacy/optimizer/serialization.h
 delete mode 100644 paddle/legacy/optimizer/serialization_test.cc
 delete mode 100644 paddle/legacy/optimizer/sgd_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/sgd_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/tensor.h
 delete mode 100644 paddle/legacy/parameter/Argument.cpp
 delete mode 100644 paddle/legacy/parameter/Argument.h
 delete mode 100644 paddle/legacy/parameter/AverageOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/AverageOptimizer.h
 delete mode 100644 paddle/legacy/parameter/CMakeLists.txt
 delete mode 100644 paddle/legacy/parameter/FirstOrderOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/FirstOrderOptimizer.h
 delete mode 100644 paddle/legacy/parameter/LearningRateScheduler.cpp
 delete mode 100644 paddle/legacy/parameter/LearningRateScheduler.h
 delete mode 100644 paddle/legacy/parameter/OptimizerFunctions.cpp
 delete mode 100644 paddle/legacy/parameter/OptimizerFunctions.h
 delete mode 100644 paddle/legacy/parameter/OptimizerWithRegularizer.cpp
 delete mode 100644 paddle/legacy/parameter/OptimizerWithRegularizer.h
 delete mode 100644 paddle/legacy/parameter/Parameter.cpp
 delete mode 100644 paddle/legacy/parameter/Parameter.h
 delete mode 100644 paddle/legacy/parameter/ParameterOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterOptimizer.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdateFunctions.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdateFunctions.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterBase.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterBase.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterHook.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterHook.h
 delete mode 100644 paddle/legacy/parameter/Regularizer.cpp
 delete mode 100644 paddle/legacy/parameter/Regularizer.h
 delete mode 100644 paddle/legacy/parameter/ThreadLocalBuffer.cpp
 delete mode 100644 paddle/legacy/parameter/ThreadLocalBuffer.h
 delete mode 100644 paddle/legacy/parameter/Weight.cpp
 delete mode 100644 paddle/legacy/parameter/Weight.h
 delete mode 100644 paddle/legacy/parameter/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/parameter/tests/test_argument.cpp
 delete mode 100644 paddle/legacy/parameter/tests/test_common.cpp
 delete mode 100644 paddle/legacy/pserver/BaseClient.cpp
 delete mode 100644 paddle/legacy/pserver/BaseClient.h
 delete mode 100644 paddle/legacy/pserver/CMakeLists.txt
 delete mode 100644 paddle/legacy/pserver/LightNetwork.cpp
 delete mode 100644 paddle/legacy/pserver/LightNetwork.h
 delete mode 100644 paddle/legacy/pserver/ParameterClient2.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterClient2.h
 delete mode 100644 paddle/legacy/pserver/ParameterServer2.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServer2.h
 delete mode 100644 paddle/legacy/pserver/ParameterServer2Main.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServerController.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServerController.h
 delete mode 100644 paddle/legacy/pserver/ProtoServer.cpp
 delete mode 100644 paddle/legacy/pserver/ProtoServer.h
 delete mode 100644 paddle/legacy/pserver/RDMANetwork.h
 delete mode 100644 paddle/legacy/pserver/SocketChannel.cpp
 delete mode 100644 paddle/legacy/pserver/SocketChannel.h
 delete mode 100644 paddle/legacy/pserver/SparseParameterDistribution.cpp
 delete mode 100644 paddle/legacy/pserver/SparseParameterDistribution.h
 delete mode 100644 paddle/legacy/pserver/test/.gitignore
 delete mode 100644 paddle/legacy/pserver/test/CMakeLists.txt
 delete mode 100644 paddle/legacy/pserver/test/SocketTest.cpp
 delete mode 100644 paddle/legacy/pserver/test/test_ParameterServer2.cpp
 delete mode 100644 paddle/legacy/pserver/test/test_ProtoServer.cpp
 delete mode 100755 paddle/legacy/pserver/test/test_ProtoServer.sh
 delete mode 100644 paddle/legacy/trainer/CMakeLists.txt
 delete mode 100644 paddle/legacy/trainer/MergeModel.cpp
 delete mode 100644 paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/NewRemoteParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/ParamUtil.cpp
 delete mode 100644 paddle/legacy/trainer/ParamUtil.h
 delete mode 100644 paddle/legacy/trainer/ParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/ParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/RemoteParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/RemoteParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/Tester.cpp
 delete mode 100644 paddle/legacy/trainer/Tester.h
 delete mode 100644 paddle/legacy/trainer/TesterConfig.h
 delete mode 100644 paddle/legacy/trainer/ThreadParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/ThreadParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/Trainer.cpp
 delete mode 100644 paddle/legacy/trainer/Trainer.h
 delete mode 100644 paddle/legacy/trainer/TrainerBenchmark.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerConfigHelper.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerConfigHelper.h
 delete mode 100644 paddle/legacy/trainer/TrainerInternal.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerInternal.h
 delete mode 100644 paddle/legacy/trainer/TrainerInternalConfig.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerInternalConfig.h
 delete mode 100644 paddle/legacy/trainer/TrainerMain.cpp
 delete mode 100644 paddle/legacy/trainer/tests/.gitignore
 delete mode 100644 paddle/legacy/trainer/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/trainer/tests/__init__.py
 delete mode 100644 paddle/legacy/trainer/tests/config_parser_test.py
 delete mode 100644 paddle/legacy/trainer/tests/fake_file_list.list
 delete mode 100644 paddle/legacy/trainer/tests/picojson.h
 delete mode 100644 paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
 delete mode 100644 paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
 delete mode 100644 paddle/legacy/trainer/tests/sample_data.txt
 delete mode 100644 paddle/legacy/trainer/tests/sample_filelist.txt
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
 delete mode 100644 paddle/legacy/trainer/tests/simple_sparse_neural_network.py
 delete mode 100644 paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
 delete mode 100644 paddle/legacy/trainer/tests/testPyDataWrapper.py
 delete mode 100644 paddle/legacy/trainer/tests/test_Compare.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_Trainer.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_config.conf
 delete mode 100644 paddle/legacy/trainer/tests/test_gen_dict.txt
 delete mode 100644 paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
 delete mode 100644 paddle/legacy/utils/.gitignore
 delete mode 100644 paddle/legacy/utils/Any.h
 delete mode 100644 paddle/legacy/utils/CMakeLists.txt
 delete mode 100644 paddle/legacy/utils/ClassRegistrar.h
 delete mode 100644 paddle/legacy/utils/Common.h
 delete mode 100644 paddle/legacy/utils/CpuId.cpp
 delete mode 100644 paddle/legacy/utils/CpuId.h
 delete mode 100644 paddle/legacy/utils/CustomStackTrace.cpp
 delete mode 100644 paddle/legacy/utils/CustomStackTrace.h
 delete mode 100644 paddle/legacy/utils/DynamicLoader.cpp
 delete mode 100644 paddle/legacy/utils/DynamicLoader.h
 delete mode 100644 paddle/legacy/utils/Error.h
 delete mode 100644 paddle/legacy/utils/Excepts.h
 delete mode 100644 paddle/legacy/utils/Flags.cpp
 delete mode 100644 paddle/legacy/utils/Flags.h
 delete mode 100644 paddle/legacy/utils/GlobalConstants.cpp
 delete mode 100644 paddle/legacy/utils/GlobalConstants.h
 delete mode 100644 paddle/legacy/utils/Locks.h
 delete mode 100644 paddle/legacy/utils/Logging.cpp
 delete mode 100644 paddle/legacy/utils/Logging.h
 delete mode 100644 paddle/legacy/utils/PythonUtil.cpp
 delete mode 100644 paddle/legacy/utils/PythonUtil.h
 delete mode 100644 paddle/legacy/utils/Queue.h
 delete mode 100644 paddle/legacy/utils/Stat.cpp
 delete mode 100644 paddle/legacy/utils/Stat.h
 delete mode 100644 paddle/legacy/utils/StringUtil.cpp
 delete mode 100644 paddle/legacy/utils/StringUtil.h
 delete mode 100644 paddle/legacy/utils/Thread.h
 delete mode 100644 paddle/legacy/utils/ThreadLocal.cpp
 delete mode 100644 paddle/legacy/utils/ThreadLocal.h
 delete mode 100644 paddle/legacy/utils/Util.cpp
 delete mode 100644 paddle/legacy/utils/Util.h
 delete mode 100644 paddle/legacy/utils/Version.cpp
 delete mode 100644 paddle/legacy/utils/Version.h
 delete mode 100644 paddle/legacy/utils/arch/linux/Locks.cpp
 delete mode 100644 paddle/legacy/utils/arch/osx/Excepts.cpp
 delete mode 100644 paddle/legacy/utils/arch/osx/Locks.cpp
 delete mode 100644 paddle/legacy/utils/enable_virtualenv.py
 delete mode 100644 paddle/legacy/utils/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/utils/tests/test_CustomStackTrace.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
 delete mode 100755 paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
 delete mode 100644 paddle/legacy/utils/tests/test_Error.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_SIMDFlags.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_SpinLock.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_StringUtils.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_Thread.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_ThreadBarrier.cpp

diff --git a/paddle/legacy/api/Arguments.cpp b/paddle/legacy/api/Arguments.cpp
deleted file mode 100644
index 7bb5a6f75b9..00000000000
--- a/paddle/legacy/api/Arguments.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "paddle/legacy/parameter/Argument.h"
-
-size_t Arguments::getSlotNum() const { return m->outputs.size(); }
-
-Arguments* Arguments::createArguments(size_t slotNum) {
-  auto args = new Arguments();
-  args->m->outputs.resize(slotNum);
-  return args;
-}
-
-void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
-
-Arguments::Arguments() : m(new ArgumentsPrivate()) {}
-
-Arguments::~Arguments() { delete m; }
-
-Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
-  auto p = (std::vector<paddle::Argument>*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs = *p;
-  return args;
-}
-
-Arguments* Arguments::createByPaddleArgument(const void* ptr) {
-  auto p = (paddle::Argument*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs.push_back(*p);
-  return args;
-}
-
-Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.value);
-}
-
-Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.grad);
-}
-
-IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.ids);
-}
-
-Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.in);
-}
-
-void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.ids = v;
-}
-
-template <typename T1>
-static inline void doCopyFromSafely(std::shared_ptr<T1>& dest,
-                                    std::shared_ptr<T1>& src) {
-  if (src) {
-    if (dest) {
-      dest->copyFrom(*src);
-    } else {
-      dest = src;
-    }
-  }
-}
-
-IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.sequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.sequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.subSequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.subSequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-void Arguments::setSlotSequenceStartPositions(size_t idx,
-                                              IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.sequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-void Arguments::setSlotSubSequenceStartPositions(
-    size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims);
-}
-
-void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
-}
-
-float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
-
-int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getBatchSize();
-}
-
-void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameHeight(h);
-}
-
-void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameWidth(w);
-}
-
-size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameHeight();
-}
-
-size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameWidth();
-}
-
-void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; }
diff --git a/paddle/legacy/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt
deleted file mode 100644
index 06e1f5d5f08..00000000000
--- a/paddle/legacy/api/CMakeLists.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-set(API_SOURCES
-    Arguments.cpp
-    ConfigParser.cpp
-    Evaluator.cpp
-    GradientMachine.cpp
-    Matrix.cpp
-    Parameter.cpp
-    ParameterOptimizer.cpp
-    ParameterUpdater.cpp
-    SequenceGenerator.cpp
-    Trainer.cpp
-    Util.cpp
-    Vector.cpp)
-set(API_HEADER
-    PaddleAPI.h
-    Internal.h)
-
-add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
-
-INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
-
-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
-
-SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
-
-SET(SWIG_NEED_FLAGS
-    -ftls-model=global-dynamic
-    -Wno-parentheses-equality
-    -Wno-self-assign
-    -Wno-maybe-uninitialized
-    -Wno-missing-field-initializers)
-  FOREACH(flag ${SWIG_NEED_FLAGS})
-  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
-ENDFOREACH()
-
-SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
-
-SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
-    paddle_parameter
-    paddle_function
-    paddle_math
-    paddle_utils
-    paddle_gserver
-    paddle_pserver
-    paddle_api
-    paddle_cuda
-    paddle_trainer_lib
-    paddle_network
-    paddle_proto
-    ${external_project_dependencies}
-    ${RDMA_LIBS}
-)
-
-IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-ELSE(APPLE)
-    SET(START_GROUP "-Xlinker -start-group")
-    SET(END_GROUP "-Xlinker -end-group")
-    SET(ARCHIVE_START "-Wl,--whole-archive")
-    SET(ARCHIVE_END "-Wl,--no-whole-archive")
-ENDIF(APPLE)
-
-SWIG_ADD_MODULE(swig_paddle python Paddle.i)
-SWIG_LINK_LIBRARIES(swig_paddle
-    ${MACOS_LD_FLAGS}
-    ${START_GROUP}
-    ${ARCHIVE_START}
-    paddle_gserver
-    paddle_function
-    ${METRIC_LIBS}
-    ${ARCHIVE_END}
-    paddle_pserver
-    paddle_trainer_lib
-    paddle_network
-    paddle_parameter
-    paddle_optimizer
-    paddle_math
-    paddle_utils
-    paddle_proto
-    paddle_cuda
-    paddle_api
-    ${CMAKE_DL_LIBS}
-    ${EXTERNAL_LIBS}
-    ${CMAKE_THREAD_LIBS_INIT}
-    ${RDMA_LD_FLAGS}
-    ${START_END}
-)
-
-add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-    DEPENDS _swig_paddle
-)
-
-# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
-
-if(WITH_TESTING)
-    IF(NOT PY_PIP_FOUND)
-        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
-        ExternalProject_Add(pip
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY      https://github.com/pypa/pip.git
-            GIT_TAG             9.0.1
-            PREFIX              ${PIP_SOURCES_DIR}
-            CONFIGURE_COMMAND   ""
-            BUILD_COMMAND       ""
-            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-            BUILD_IN_SOURCE     1
-            #DEPENDS python setuptools python_api_wheel
-        )
-    ENDIF()
-    add_subdirectory(test)
-endif()
diff --git a/paddle/legacy/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp
deleted file mode 100644
index 016d6da4e2e..00000000000
--- a/paddle/legacy/api/ConfigParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-#include "paddle/legacy/trainer/Trainer.h"
-
-struct ParameterConfigPrivate {
-  paddle::ParameterPtr parameter;
-  paddle::ParameterConfig config;
-
-  inline paddle::ParameterConfig* getConfigPtr() {
-    if (parameter != nullptr) {
-      auto& conf = parameter->getConfig();
-      return const_cast<paddle::ParameterConfig*>(&conf);
-    } else {
-      return &config;
-    }
-  }
-};
-
-TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
-
-TrainerConfig::~TrainerConfig() { delete m; }
-
-TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
-    const std::string& confPath) {
-  LOG(INFO) << "load trainer config from " << confPath;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
-  auto retv = new TrainerConfig();
-  retv->m->conf = conf;
-  return retv;
-}
-
-TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
-  auto retv = new TrainerConfig();
-  paddle::TrainerConfig trainerConfigProto;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
-  CHECK(conf->getMutableConfig().ParseFromString(str));
-  retv->m->conf = conf;
-  return retv;
-}
-
-ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {}
-
-ModelConfig::~ModelConfig() { delete m; }
-
-ModelConfig* TrainerConfig::getModelConfig() const {
-  auto retv = new ModelConfig();
-  retv->m->conf = m->conf;
-  return retv;
-}
-
-ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
-
-ParameterConfig::~ParameterConfig() { delete m; }
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
-    void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p != nullptr) {
-    auto conf = new ParameterConfig();
-    conf->m->parameter = p;
-    return conf;
-  } else {
-    return nullptr;
-  }
-}
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterPtr(
-    void* ptr) {
-  auto& p = *(paddle::Parameter*)(ptr);
-  auto conf = new ParameterConfig();
-  conf->m->config = p.getConfig();
-  return conf;
-}
-
-std::string ParameterConfig::toProtoString() const {
-  return m->getConfigPtr()->SerializeAsString();
-}
-
-void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
-
-OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
-
-OptimizationConfig::~OptimizationConfig() { delete m; }
-
-std::string OptimizationConfig::toProtoString() {
-  return m->getConfig().SerializeAsString();
-}
-
-OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
-  auto opt_config = new OptimizationConfig();
-  opt_config->m->trainer_config = m->conf;
-  return opt_config;
-}
-
-OptimizationConfig* OptimizationConfig::createFromProtoString(
-    const std::string& str) {
-  auto conf = new OptimizationConfig();
-  conf->m->config.ParseFromString(str);
-  return conf;
-}
diff --git a/paddle/legacy/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp
deleted file mode 100644
index c4aac47cbec..00000000000
--- a/paddle/legacy/api/Evaluator.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <sstream>
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-Evaluator::Evaluator() : m(new EvaluatorPrivate()) {}
-Evaluator::~Evaluator() { delete m; }
-
-void Evaluator::start() { m->rawPtr->start(); }
-
-void Evaluator::finish() { m->rawPtr->finish(); }
-
-std::string Evaluator::toString() {
-  std::ostringstream sout;
-  m->rawPtr->printStats(sout);
-  return sout.str();
-}
-
-std::vector<std::string> Evaluator::getNames() const {
-  std::vector<std::string> retv;
-  m->rawPtr->getNames(&retv);
-  return retv;
-}
-
-double Evaluator::getValue(const std::string name) const {
-  paddle::Error err;
-  double v = m->rawPtr->getValue(name, &err);
-  if (!err.isOK()) {
-    throw std::runtime_error(err.msg());
-  }
-  return v;
-}
diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp
deleted file mode 100644
index 5ad2fe11a4c..00000000000
--- a/paddle/legacy/api/GradientMachine.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "Internal.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-
-std::vector<int> GradientMachine::defaultParamTypes = {
-    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
-
-GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
-
-GradientMachine::~GradientMachine() { delete m; }
-
-GradientMachine* GradientMachine::createFromPaddleModelPtr(
-    const void* confPtr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto& conf = *(const paddle::ModelConfig*)(confPtr);
-  std::vector<ParameterType> realTypes;
-  staticCastVector(&realTypes, types);
-  auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
-  auto machinePtr = std::shared_ptr<paddle::GradientMachine>(machineRawPtr);
-  if (machinePtr != nullptr) {
-    auto machine = new GradientMachine();
-    machine->m->machine = machinePtr;
-    return machine;
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByConfigProtoStr(
-    const std::string& protoStr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  paddle::ModelConfig conf;
-  conf.ParseFromString(protoStr);
-  if (conf.IsInitialized()) {
-    return GradientMachine::createFromPaddleModelPtr(&conf, mode, types);
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByModelConfig(
-    ModelConfig* conf,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto confPtr = &conf->m->conf->getModelConfig();
-  return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
-}
-
-void GradientMachine::start() { m->machine->start(); }
-
-void GradientMachine::finish() { m->machine->finish(); }
-
-void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
-
-void GradientMachine::prefetch(const Arguments& inArgs) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  m->machine->prefetch(in);
-}
-
-void GradientMachine::forward(const Arguments& inArgs,
-                              Arguments* outArgs,
-                              PassType passType) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forward(in, &out, pt);
-}
-
-UpdateCallback::~UpdateCallback() {}
-
-void UpdateCallback::apply(Parameter* p) {
-  // UNUSED(p);
-}
-
-class UpdateCallbackWrapper {
- public:
-  explicit UpdateCallbackWrapper(const UpdateCallback& callback)
-      : callback(const_cast<UpdateCallback&>(callback)) {}
-
-  void operator()(paddle::Parameter* param) {
-    auto p = Parameter::createFromRawPtr(&param);
-    // @TODO Use Stack variable instead.
-    callback.apply(p);
-    delete p;
-  }
-
- private:
-  UpdateCallback& callback;
-};
-
-void GradientMachine::backward(const UpdateCallback& callback) {
-  m->machine->backward(UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::forwardBackward(const Arguments& inArgs,
-                                      Arguments* outArgs,
-                                      PassType passType,
-                                      const UpdateCallback& callback) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::loadParameters(const std::string& path) {
-  m->machine->loadParameters(path);
-}
-
-size_t GradientMachine::getParameterSize() const {
-  return m->machine->getParameters().size();
-}
-
-Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-size_t GradientMachine::getNonStaticParameterSize() const {
-  return m->machine->getNonStaticParameters().size();
-}
-
-Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getNonStaticParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(
-        &m->machine->getNonStaticParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-void GradientMachine::randParameters() { m->machine->randParameters(); }
-
-Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
-    throw(UnsupportError) {
-  auto nn = m->machine;
-  if (nn) {
-    auto arg = nn->getLayerOutput(layerName);
-    return Arguments::createByPaddleArgument(&arg);
-  } else {
-    throw UnsupportError();
-  }
-}
-
-SequenceGenerator* GradientMachine::asSequenceGenerator(
-    const std::vector<std::string>& dict,
-    size_t begin_id,
-    size_t end_id,
-    size_t max_length,
-    size_t beam_size) {
-  SequenceGenerator* r =
-      SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
-  r->setDict(dict);
-  r->setBos(begin_id);
-  r->setEos(end_id);
-  r->setMaxLength(max_length);
-  r->setBeamSize(beam_size);
-  return r;
-}
-
-Evaluator* GradientMachine::makeEvaluator() {
-  auto ev = new Evaluator();
-  ev->m->rawPtr = m->machine->makeEvaluator();
-  return ev;
-}
-
-void GradientMachine::eval(Evaluator* evaluator) {
-  m->machine->eval(evaluator->m->rawPtr);
-}
diff --git a/paddle/legacy/api/Internal.h b/paddle/legacy/api/Internal.h
deleted file mode 100644
index 2195cc6739d..00000000000
--- a/paddle/legacy/api/Internal.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "PaddleAPI.h"
-
-#include <algorithm>
-#include <vector>
-
-template <typename T1, typename T2>
-void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
-  dest->resize(src.size());
-  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
-    return static_cast<T2>(t);
-  });
-}
diff --git a/paddle/legacy/api/Matrix.cpp b/paddle/legacy/api/Matrix.cpp
deleted file mode 100644
index 8862d0ea92c..00000000000
--- a/paddle/legacy/api/Matrix.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/Matrix.h"
-#include <cstring>
-#include <iostream>
-#include "PaddleAPI.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-struct MatrixPrivate {
-  std::shared_ptr<paddle::Matrix> mat;
-};
-
-Matrix::Matrix() : m(new MatrixPrivate()) {}
-
-Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) {
-  auto* mat = reinterpret_cast<paddle::MatrixPtr*>(sharedPtr);
-  if ((*mat) != nullptr) {
-    auto m = new Matrix();
-    m->m->mat = *mat;
-    return m;
-  } else {
-    return nullptr;
-  }
-}
-
-Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->zero();
-  return m;
-}
-
-Matrix* Matrix::createDense(const std::vector<float>& data,
-                            size_t height,
-                            size_t width,
-                            bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->copyFrom(data.data(), data.size());
-  return m;
-}
-
-Matrix* Matrix::createDenseFromNumpy(float* data,
-                                     int dim1,
-                                     int dim2,
-                                     bool copy,
-                                     bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// Gpu mode only supports copy=True
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
-  } else {
-    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
-  }
-}
-
-Matrix* Matrix::createCpuDenseFromNumpy(float* data,
-                                        int dim1,
-                                        int dim2,
-                                        bool copy) {
-  auto m = new Matrix();
-  if (copy) {
-    m->m->mat = paddle::Matrix::create(dim1, dim2);
-    m->m->mat->copyFrom(data, dim1 * dim2);
-  } else {
-    m->m->mat = paddle::Matrix::create(data, dim1, dim2, false);
-  }
-  return m;
-}
-
-Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(dim1, dim2, false, true);
-  m->m->mat->copyFrom(data, dim1 * dim2);
-  return m;
-}
-
-Matrix* Matrix::createSparse(size_t height,
-                             size_t width,
-                             size_t nnz,
-                             bool isNonVal,
-                             bool isTrans,
-                             bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      isTrans,
-      useGpu);
-  return m;
-}
-
-Matrix::~Matrix() { delete m; }
-
-size_t Matrix::getHeight() const { return m->mat->getHeight(); }
-
-size_t Matrix::getWidth() const { return m->mat->getWidth(); }
-
-float Matrix::get(size_t x, size_t y) const throw(RangeError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  return m->mat->getElement(x, y);
-}
-
-void Matrix::set(size_t x, size_t y, float val) throw(RangeError,
-                                                      UnsupportError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  auto rawMat = m->mat.get();
-  if (auto cDenseMat = dynamic_cast<paddle::CpuMatrix*>(rawMat)) {
-    *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val;
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-bool Matrix::isSparse() const {
-  auto raw_mat = m->mat.get();
-  return dynamic_cast<paddle::CpuSparseMatrix*>(raw_mat) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(raw_mat) != nullptr;
-}
-
-SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseValueType)cpuSparseMat->getValueType();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return (SparseValueType)gpuSparseMat->getValueType();
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseFormatType)cpuSparseMat->getFormat();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return SPARSE_CSR;
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-IntArray Matrix::getSparseRowCols(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getFormat() == paddle::SPARSE_CSR) {
-    if (i < cpuSparseMat->getHeight()) {
-      // cpuSparseMat->print(std::cout);
-      size_t len = cpuSparseMat->getColNum(i);
-      return IntArray(cpuSparseMat->getRowCols(i), len);
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) {
-    if (i < cpuSparseMat->getHeight()) {
-      return IntWithFloatArray(cpuSparseMat->getRowValues(i),
-                               cpuSparseMat->getRowCols(i),
-                               cpuSparseMat->getColNum(i));
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-FloatArray Matrix::getData() const {
-  auto rawMat = m->mat.get();
-  if (dynamic_cast<paddle::GpuMemoryHandle*>(rawMat->getMemoryHandle().get())) {
-    // is gpu. then copy data
-    float* data = rawMat->getData();
-    size_t len = rawMat->getElementCnt();
-    float* cpuData = new float[len];
-    hl_memcpy_device2host(cpuData, data, len * sizeof(float));
-    FloatArray ret_val(cpuData, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt());
-    return ret_val;
-  }
-}
-
-void Matrix::sparseCopyFrom(
-    const std::vector<int>& rows,
-    const std::vector<int>& cols,
-    const std::vector<float>& vals) throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    // LOG(INFO) <<"RowSize = "<<rows.size()
-    //  <<" ColSize = "<<cols.size()
-    //  <<" ValSize = "<<vals.size();
-    cpuSparseMat->copyFrom(const_cast<std::vector<int>&>(rows),
-                           const_cast<std::vector<int>&>(cols),
-                           const_cast<std::vector<float>&>(vals));
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-void* Matrix::getSharedPtr() const { return &m->mat; }
-
-void Matrix::toNumpyMatInplace(float** view_data,
-                               int* dim1,
-                               int* dim2) throw(UnsupportError) {
-  auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
-  if (cpuMat) {
-    *dim1 = cpuMat->getHeight();
-    *dim2 = cpuMat->getWidth();
-    *view_data = cpuMat->getData();
-  } else {
-    throw UnsupportError();
-  }
-}
-void Matrix::copyToNumpyMat(float** view_m_data,
-                            int* dim1,
-                            int* dim2) throw(UnsupportError) {
-  static_assert(sizeof(paddle::real) == sizeof(float),
-                "Currently PaddleAPI only support for single "
-                "precision version of paddle.");
-  if (this->isSparse()) {
-    throw UnsupportError();
-  } else {
-    *dim1 = m->mat->getHeight();
-    *dim2 = m->mat->getWidth();
-    *view_m_data = new float[(*dim1) * (*dim2)];
-    if (auto cpuMat = dynamic_cast<paddle::CpuMatrix*>(m->mat.get())) {
-      auto src = cpuMat->getData();
-      auto dest = *view_m_data;
-      std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
-      auto src = gpuMat->getData();
-      auto dest = *view_m_data;
-      hl_memcpy_device2host(
-          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else {
-      LOG(WARNING) << "Unexpected Situation";
-      throw UnsupportError();
-    }
-  }
-}
-
-void Matrix::copyFromNumpyMat(float* data,
-                              int dim1,
-                              int dim2) throw(UnsupportError, RangeError) {
-  if (isSparse()) {
-    throw UnsupportError();
-  } else {
-    if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) {
-      if (m->mat->getData() != data) {
-        m->mat->copyFrom(data, dim1 * dim2);
-      }
-    } else {
-      throw RangeError();
-    }
-  }
-}
-
-bool Matrix::isGpu() const {
-  auto rawPtr = m->mat.get();
-  return dynamic_cast<paddle::GpuMatrix*>(rawPtr) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(rawPtr) != nullptr;
-}
diff --git a/paddle/legacy/api/Paddle.i b/paddle/legacy/api/Paddle.i
deleted file mode 100644
index 7a1456a5c06..00000000000
--- a/paddle/legacy/api/Paddle.i
+++ /dev/null
@@ -1,202 +0,0 @@
-%module(directors="1") swig_paddle
-%include "std_string.i"
-%{
-#define SWIG_FILE_WITH_INIT
-#include "legacy/api/PaddleAPI.h"
-%}
-
-%include "exception.i"
-%typemap(throws) UnsupportError %{
-  SWIG_exception(SWIG_RuntimeError, $1.what());
-  SWIG_fail;
-%}
-
-%include "std_vector.i"
-%include "std_pair.i"
-#ifdef SWIGPYTHON
-%include "numpy.i"
-#endif
-
-%init %{
-#ifdef SWIGPYTHON
-import_array();
-#endif
-%}
-
-
-namespace std {
-%template(vector_int) vector<int>;
-%template(vector_uint) vector<unsigned int>;
-%template(vector_float) vector<float>;
-%template(vector_string) vector<string>;
-%template(vector_vec_star) vector<Vector*>;
-}
-#ifdef SWIGPYTHON 
-%typemap(in) (int argc, char** argv) { 
-    int i = 0; 
-    if (!PyList_Check($input)) { 
-        PyErr_SetString(PyExc_ValueError, "Expecting a list"); 
-        return NULL; 
-    } 
-    $1 = PyList_Size($input); 
-    $2 = (char **) malloc(($1+1)*sizeof(char *)); 
-    for (i = 0; i < $1; i++) { 
-        PyObject *s = PyList_GetItem($input,i); 
-        if (!PyString_Check(s)) { 
-            free($2); 
-            PyErr_SetString(PyExc_ValueError, "List items must be strings"); 
-            return NULL; 
-        } 
-        $2[i] = PyString_AsString(s); 
-    } 
-    $2[i] = 0; 
-} 
-%typemap(freearg) (int argc, char** argv) { 
-    if ($2) free($2); 
-} 
-
-%typemap(out) FloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i]));
-  }  
-  if($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntArray {
-  $result = PyList_New($1.length);  
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyInt_FromLong($1.buf[i]));  
-  }
-  if ($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntWithFloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyTuple_Pack(2, 
-      PyInt_FromLong($1.idxBuf[i]),
-      PyFloat_FromDouble($1.valBuf[i])
-    ));
-  }
-  if ($1.needFree) {
-    delete [] $1.idxBuf;
-    delete [] $1.valBuf;
-  } 
-}
-
-
-%rename(__getitem__) IVector::get;
-%rename(__setitem__) IVector::set;
-%rename(__len__) IVector::getSize;
-%rename(__getitem__) Vector::get;
-%rename(__setitem__) Vector::set;
-%rename(__len__) Vector::getSize;
-%rename(__len__) Parameter::getSize;
-%rename(__call__) ParameterTraverseCallback::apply;
-%rename(__repr__) Evaluator::toString;
-
-%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
-  (float* data, int dim1, int dim2) 
-}
-
-%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { 
-  (float** view_data, int* dim1, int* dim2) 
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {
-  (float** view_m_data, int* dim1, int* dim2)  
-}
-
-%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (int** view_m_data, int* dim1)  
-}
-
-%apply (int* INPLACE_ARRAY1, int DIM1) { 
-  (int* data, int dim) 
-}
-
-%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (int** view_data, int* dim1)  
-}
-
-%apply (float* INPLACE_ARRAY1, int DIM1) {
-  (float* data, int dim)
-}
-
-%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (float** view_data, int* dim1)
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (float** view_m_data, int* dim1)
-}
-
-#endif
-// The below functions internally create object by "new", so it should use
-// use SWIG to handle gc. There are hints for SWIG to handle GC.
-%newobject Matrix::createZero;
-%newobject Matrix::createSparse;
-%newobject Matrix::createDense;
-%newobject Matrix::createDenseFromNumpy;
-%newobject Matrix::createCpuDenseFromNumpy;
-%newobject Matrix::createGpuDenseFromNumpy;
-%newobject Vector::createZero;
-%newobject Vector::create;
-%newobject Vector::createVectorFromNumpy;
-%newobject Vector::createCpuVectorFromNumpy;
-%newobject Vector::createGpuVectorFromNumpy;
-%newobject IVector::createZero;
-%newobject IVector::create;
-%newobject IVector::createVectorFromNumpy;
-%newobject IVector::createCpuVectorFromNumpy;
-%newobject IVector::createGpuVectorFromNumpy;
-%newobject Trainer::createByCommandLine;
-%newobject Trainer::getForwardOutput;
-%newobject Trainer::getLayerOutput;
-%newobject Arguments::getSlotValue;
-%newobject Arguments::getSlotIds;
-%newobject Arguments::getSlotIn;
-%newobject Arguments::getSlotSequenceStartPositions;
-%newobject Arguments::getSlotSequenceDim;
-%newobject Arguments::createArguments;
-%newobject GradientMachine::createByConfigProtoStr;
-%newobject GradientMachine::createByModelConfig;
-%newobject GradientMachine::asSequenceGenerator;
-%newobject GradientMachine::getParameter;
-%newobject GradientMachine::getLayerOutput;
-%newobject GradientMachine::makeEvaluator;
-%newobject TrainerConfig::createFromTrainerConfigFile;
-%newobject TrainerConfig::getModelConfig;
-%newobject TrainerConfig::getOptimizationConfig;
-%newobject Parameter::getBuf;
-%newobject Parameter::getConfig;
-%newobject ParameterOptimizer::create;
-%newobject ParameterOptimizer::needSpecialTraversal;
-%newobject ParameterUpdater::createLocalUpdater;
-%newobject ParameterUpdater::createRemoteUpdater;
-%newobject ParameterUpdater::createNewRemoteUpdater;
-
-%feature("director") UpdateCallback;
-%feature("autodoc", 1); // To generate method stub, for code hint in ide
-
-// Ignore many private class, and method cannot be handled by swig.
-%ignore MatrixPrivate;
-%ignore TrainerPrivate;
-%ignore IVector::operator[];
-%ignore ArgumentsPrivate;
-%ignore GradientMachinePrivate;
-%ignore TrainerConfigPrivate;
-%ignore ModelConfigPrivate;
-%ignore ParameterPrivate;
-%ignore SequenceGeneratorPrivate;
-%ignore VectorPrivate;
-%ignore ParameterConfigPrivate;
-%ignore OptimizationConfigPrivate;
-%ignore ParameterTraverseCallbackPrivate;
-%include "legacy/utils/GlobalConstants.h"
-%include "legacy/api/PaddleAPI.h"
diff --git a/paddle/legacy/api/PaddleAPI.h b/paddle/legacy/api/PaddleAPI.h
deleted file mode 100644
index 475984a3d57..00000000000
--- a/paddle/legacy/api/PaddleAPI.h
+++ /dev/null
@@ -1,1054 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-/// Import PaddlePaddle's enumeration into global namespace.
-using namespace paddle::enumeration_wrapper;  // NOLINT
-
-/**
- * @brief Initialize paddle.
- *
- * In python, this method should be invoked as
- * @code
- *  import sys
- *  import paddle
- *  paddle.initPaddle(sys.argv)
- *  or you can change arguments as any list of str.
- * @endcode
- */
-void initPaddle(int argc, char** argv);
-
-/// Return FLAGS_use_gpu
-bool isUsingGpu();
-
-/// Set the Flags_use_gpu to the given parameter
-void setUseGpu(bool useGpu);
-
-/// Return true if this py_paddle is compiled in GPU Version
-bool isGpuVersion();
-
-/// Return FLAGS_trainer_count
-int getTrainerCount();
-
-/// The Error of IO Operation. Such as file not found, etc.
-class IOError {};
-
-/// Out of range error
-class RangeError {};
-
-/// Not support Error, such as access GPU memory directly, etc.
-class UnsupportError : public std::runtime_error {
- public:
-  UnsupportError() : std::runtime_error(" ") {}
-  explicit UnsupportError(const std::string& message)
-      : std::runtime_error(message) {}
-};
-
-/// This type will map to python's list of float.
-struct FloatArray {
-  const float* buf;
-  const size_t length;
-  bool needFree;  // true if the buf is dynamic alloced.
-  FloatArray(const float* b, const size_t l);
-};
-
-/// This type will map to python's list of int
-struct IntArray {
-  const int* buf;
-  const size_t length;
-  bool needFree;
-  IntArray(const int* b, const size_t l, bool f = false);
-};
-
-/// This type will map to python's list of (int, float)
-struct IntWithFloatArray {
-  const float* valBuf;
-  const int* idxBuf;
-  const size_t length;
-  bool needFree;
-  IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false);
-};
-
-enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 };
-
-enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-/**
- * In Python, -1UL is hard to write. So define a const value used by python
- * side.
- */
-const size_t NO_SPARSE_ID = -1UL;
-
-struct MatrixPrivate;
-class Matrix {
-  Matrix();  // User Cannot Create Matrix.
-  DISABLE_COPY(Matrix);
-  static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
-
- public:
-  virtual ~Matrix();
-
-  /**
-   * Create A Matrix with height,width, which is filled by zero.
-   */
-  static Matrix* createZero(size_t height,
-                            size_t width,
-                            bool useGpu = isUsingGpu());
-
-  /**
-   * Create Sparse Matrix.
-   *
-   * After create sparse, sparseCopyFrom can be used to fill matrix.
-   *
-   * @param nnz  Number of non zero values.
-   *
-   * @note the default sparse type is SPARSE_CSR.
-   */
-  static Matrix* createSparse(size_t height,
-                              size_t width,
-                              size_t nnz,
-                              bool isNonVal = true,
-                              bool trans = false,
-                              bool useGpu = isUsingGpu());
-
-  /**
-   * Create Dense Matrix.
-   *
-   * @param data  list of float should be passed in python.
-   * @note        the value will be copy into a new matrix.
-   */
-  static Matrix* createDense(const std::vector<float>& data,
-                             size_t height,
-                             size_t width,
-                             bool useGpu = isUsingGpu());
-
-  static Matrix* createDenseFromNumpy(
-      float* data,
-      int dim1,
-      int dim2,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
-   *
-   *  @param data  a numpy matrix.
-   *  @param dim1  dimension of data.
-   *  @param dim2  dimension of data.
-   *  @param copy  true if copy into a new matrix, false will create
-   *               matrix inplace. copy = false should be used with extreme
-   *               care because Matrix will share the memory with the given
-   *               numpy array. If the numpy array object is no longer valid,
-   *               the memory space will not be usable.
-   */
-  static Matrix* createCpuDenseFromNumpy(float* data,
-                                         int dim1,
-                                         int dim2,
-                                         bool copy = true);
-
-  /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
-  static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
-
-  /**
-   * Cast to numpy matrix.
-   *
-   * @note    This method take no parameter in python.
-   * @note    This method in python will return a numpy matrix, not void.
-   * @note    Only CpuDenseMatrix is supported.
-   *
-   * Example:
-   * @code
-   * import paddle
-   * m = paddle.Matrix.createZero(10,2)
-   * numpy_mat = m.toNumpyMat()
-   * @endcode
-   */
-  void toNumpyMatInplace(float** view_data,
-                         int* dim1,
-                         int* dim2) throw(UnsupportError);
-
-  /// Copy To numpy mat.
-  void copyToNumpyMat(float** view_m_data,
-                      int* dim1,
-                      int* dim2) throw(UnsupportError);
-
-  /// Copy From Numpy Mat
-  void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError,
-                                                               RangeError);
-
-  /// return true if this matrix is sparse.
-  bool isSparse() const;
-
-  SparseValueType getSparseValueType() const throw(UnsupportError);
-
-  SparseFormatType getSparseFormat() const throw(UnsupportError);
-
-  IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError);
-
-  IntWithFloatArray getSparseRowColsVal(size_t i) const
-      throw(UnsupportError, RangeError);
-
-  size_t getHeight() const;
-
-  size_t getWidth() const;
-
-  float get(size_t x, size_t y) const throw(RangeError);
-
-  void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError);
-
-  /// return type is list of float
-  FloatArray getData() const;
-
-  /**
-   * Copy from rows, cols, values.
-   *
-   * if sparse_nonvalue, the values should be []
-   */
-  void sparseCopyFrom(const std::vector<int>& rows,
-                      const std::vector<int>& cols,
-                      const std::vector<float>& values =
-                          std::vector<float>()) throw(UnsupportError);
-
-  bool isGpu() const;
-
- private:
-  void* getSharedPtr() const;
-
-  MatrixPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class Arguments;
-};
-
-struct VectorPrivate;
-class Vector {
-  DISABLE_COPY(Vector);
-  Vector();
-  static Vector* createByPaddleVectorPtr(void* ptr);
-
-  void* getSharedPtr();
-
- public:
-  ~Vector();
-
-  /// Create Vector filled with zero.
-  static Vector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create Vector from list of float.
-   *
-   * It will create a new vector, and copy data into it.
-   */
-  static Vector* create(const std::vector<float>& data,
-                        bool useGpu = isUsingGpu());
-
-  static Vector* createVectorFromNumpy(
-      float* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-  /**
-   * Create Cpu Vector from numpy array, which dtype=float32
-   *
-   * If copy is false, it will create vector inplace.
-   */
-  static Vector* createCpuVectorFromNumpy(float* data,
-                                          int dim,
-                                          bool copy = true);
-
-  /// Create Gpu Vector from numpy array, which dtype=float32
-  static Vector* createGpuVectorFromNumpy(float* data, int dim);
-
-  /**
-   * copy from another vector
-   * throw(RangeError) if size of src vector is different from size of this
-   * vector
-   */
-  void copyFrom(Vector* src) throw(RangeError);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(float** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(float* data, int dim);
-
-  /// __getitem__ in python
-  float get(const size_t idx) const throw(RangeError, UnsupportError);
-
-  /// __setitem__ in python
-  void set(const size_t idx, float val) throw(RangeError, UnsupportError);
-
-  /// Return is GPU vector or not.
-  bool isGpu() const;
-
-  /// Return a list of float, the memory is alloced and copied.
-  FloatArray getData() const;
-
-  /// __len__ in python
-  size_t getSize() const;
-
- private:
-  VectorPrivate* m;
-
- private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct IVectorPrivate;
-class IVector {
-  IVector();
-  DISABLE_COPY(IVector);
-  static IVector* createByPaddleVectorPtr(void* ptr);
-
- public:
-  /// Create IVector filled with zero
-  static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create IVector from list of int.
-   * It will create a new vector, and copy data into it.
-   */
-  static IVector* create(const std::vector<int>& data,
-                         bool useGpu = isUsingGpu());
-
-  static IVector* createVectorFromNumpy(
-      int* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   * Create Cpu IVector from numpy array, which dtype=int32
-   *
-   * If copy is false, it will create vector inplace
-   */
-  static IVector* createCpuVectorFromNumpy(int* data,
-                                           int dim,
-                                           bool copy = true);
-  /**
-   * Create Gpu IVector from numpy array, which dtype=int32
-   */
-  static IVector* createGpuVectorFromNumpy(int* data, int dim);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(int** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(int* data, int dim);
-
-  virtual ~IVector();
-
-  /// Return a list of int, the memory is alloced and copied.
-  IntArray getData() const;
-
-  /// This method will map to python [] method.
-  int& operator[](const size_t idx) throw(RangeError, UnsupportError);
-
-  const int& operator[](const size_t idx) const
-      throw(RangeError, UnsupportError);
-
-  inline int get(const size_t idx) const throw(RangeError, UnsupportError) {
-    return (*this)[idx];
-  }
-
-  inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) {
-    (*this)[idx] = val;
-  }
-
-  /// Return true if it is gpu vector.
-  bool isGpu() const;
-
-  /// This method will map to python __len__();
-  size_t getSize() const;
-
- private:
-  void* getSharedPtr() const;
-
-  friend class Arguments;
-  IVectorPrivate* m;
-};
-
-struct ArgumentsPrivate;
-
-/// The Arguments is actual a std::vector<paddle::Argument> in paddle.
-class Arguments {
- private:
-  Arguments();  // Internal Create.
-  DISABLE_COPY(Arguments);
-
- public:
-  /**
-   * Create a arguments with size.
-   * Note that it can be zero.
-   */
-  static Arguments* createArguments(size_t slotNum);
-
-  void resize(size_t slotNum);
-
-  virtual ~Arguments();
-
-  /**
-   * Return the slot number that aguments contains.
-   *
-   * It is actually the vector's size
-   */
-  size_t getSlotNum() const;
-
-  /**
-   * The get functions of Arguments
-   *
-   * the param idx is the slot id
-   */
-  Matrix* getSlotValue(size_t idx) const throw(RangeError);
-  Matrix* getSlotGrad(size_t idx) const throw(RangeError);
-  IVector* getSlotIds(size_t idx) const throw(RangeError);
-  Matrix* getSlotIn(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceDim(size_t idx) const throw(RangeError);
-  // End Of get functions of Arguments
-
-  int64_t getBatchSize(size_t idx = 0) const throw(RangeError);
-
-  /**
-   * The set functions of Arguments.
-   *
-   * The param idx is the slot id.
-   * The other param is the input Matrix or vector.
-   */
-  void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
-  void setSlotSequenceStartPositions(size_t idx,
-                                     IVector* vec) throw(RangeError);
-  void setSlotSubSequenceStartPositions(size_t idx,
-                                        IVector* vec) throw(RangeError);
-  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError);
-
-  size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError);
-  size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError);
-
-  float sum() const;
-
- private:
-  static Arguments* createByPaddleArgumentVector(void* ptr);
-  static Arguments* createByPaddleArgument(const void* ptr);
-  void* getInternalArgumentsPtr() const;
-
- private:
-  ArgumentsPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class SequenceGenerator;
-};
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
-  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
-      paddle::GradientMachine::kSgdSparseCpuTraining,
-  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
-};
-
-struct ParameterConfigPrivate;
-class ParameterConfig {
-  DISABLE_COPY(ParameterConfig);
-  ParameterConfig();
-
-  /**
-   * Internal methods
-   */
-  static ParameterConfig* createParameterConfigFromParameterSharedPtr(
-      void* ptr);
-  static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
-  void* getRawPtr();
-
- public:
-  ~ParameterConfig();
-
-  /**
-   * return proto buf string.
-   */
-  std::string toProtoString() const;
-
- private:
-  ParameterConfigPrivate* m;
-
- private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct OptimizationConfigPrivate;
-class OptimizationConfig {
-  DISABLE_COPY(OptimizationConfig);
-  OptimizationConfig();
-
- public:
-  static OptimizationConfig* createFromProtoString(const std::string& str);
-  ~OptimizationConfig();
-
-  /**
-   * return protobuf string.
-   */
-  std::string toProtoString();
-
- private:
-  OptimizationConfigPrivate* m;
-
-  friend class TrainerConfig;
-  friend class ParameterOptimizer;
-  friend class ParameterUpdater;
-  friend class Trainer;
-};
-
-struct ParameterPrivate;
-class Parameter {
- private:
-  Parameter();
-  DISABLE_COPY(Parameter);
-
- public:
-  virtual ~Parameter();
-
-  /**
-   * get parameter name
-   */
-  std::string getName() const;
-
-  /**
-   * get buf in Parameter
-   */
-  Vector* getBuf(ParameterType type);
-
-  /**
-   * get id
-   */
-  size_t getID() const;
-
-  ParameterConfig* getConfig();
-  void setValueUpdated();
-
-  bool save(const std::string& filename) const;
-
-  bool load(const std::string& filename) const;
-
-  size_t getSize() const;
-
- private:
-  static Parameter* createFromRawPtr(void* ptr);
-  static Parameter* createFromSharedPtr(void* ptr);
-
- private:
-  ParameterPrivate* m;
-  friend class UpdateCallbackWrapper;
-  friend class GradientMachine;
-  friend class ParameterUpdater;
-};
-
-struct ModelConfigPrivate;
-/**
- * You can only get model config from TrainerConfig.
- *
- * It is used by GradientMachine.
- */
-class ModelConfig {
- private:
-  ModelConfig();
-  DISABLE_COPY(ModelConfig);
-
- public:
-  virtual ~ModelConfig();
-
- private:
-  ModelConfigPrivate* m;
-  friend class TrainerConfig;
-  friend struct TrainerConfigPrivate;
-  friend class GradientMachine;
-};
-
-struct TrainerConfigPrivate;
-/**
- * To get TrainerConfig from file.
- *
- * It is used by GradientMachine.
- */
-class TrainerConfig {
- private:
-  TrainerConfig();
-  DISABLE_COPY(TrainerConfig);
-
- public:
-  virtual ~TrainerConfig();
-
-  static TrainerConfig* createFromTrainerConfigFile(
-      const std::string& configPath);
-  static TrainerConfig* createFromProtoString(const std::string& str);
-
-  ModelConfig* getModelConfig() const;
-
-  OptimizationConfig* getOptimizationConfig() const;
-
- private:
-  TrainerConfigPrivate* m;
-  friend class Trainer;
-};
-
-/**
- * The callback in backword.
- *
- * You can inherit this class in python.
- *
- * @code
- * class UpdateCallbackInPython(paddle.UpdateCallback):
- *   def __init__(self):
- *     paddle.UpdateCallback.__init__(self)
- *
- *   def apply(self, param):
- *     assert isinstance(param, paddle.Parameter)
- * @endcode
- */
-class UpdateCallback {
- public:
-  virtual ~UpdateCallback();
-  virtual void apply(Parameter* p);
-};
-
-struct ParameterTraverseCallbackPrivate;
-class ParameterTraverseCallback {
-  DISABLE_COPY(ParameterTraverseCallback);
-  ParameterTraverseCallback();
-
- public:
-  ~ParameterTraverseCallback();
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& config,
-             size_t sparseId);
-
- private:
-  ParameterTraverseCallbackPrivate* m;
-  friend class ParameterOptimizer;
-};
-
-/**
- * The ParameterOptimizer Wrapper Class.
- *
- * Basically same as common/ParameterOptimizer.h
- */
-struct ParameterOptimizerPrivate;
-class ParameterOptimizer {
-  DISABLE_COPY(ParameterOptimizer);
-  ParameterOptimizer();
-
- public:
-  static ParameterOptimizer* create(OptimizationConfig* config);
-
-  ~ParameterOptimizer();
-
-  void init(size_t numRows, const ParameterConfig* config);
-
-  void startPass();
-
-  void finishPass();
-
-  void startBatch(size_t numSamplesProcessed);
-
-  void finishBatch();
-
-  void update(const std::vector<Vector*>& vecs,
-              const ParameterConfig& conf,
-              size_t sparseId = NO_SPARSE_ID);
-
-  std::vector<int> getParameterTypes() const;
-
-  ParameterTraverseCallback* needSpecialTraversal(
-      const ParameterConfig& config) const;
-
- private:
-  ParameterOptimizerPrivate* m;
-};
-
-class SequenceGenerator;
-class Evaluator;
-struct GradientMachinePrivate;
-class GradientMachine {
- private:
-  GradientMachine();
-  DISABLE_COPY(GradientMachine);
-
- public:
-  virtual ~GradientMachine();
-
-  /**
-   * Create By ProtoStr.
-   *
-   * The ProtoStr can be generate by python's protobuf code.
-   */
-  static GradientMachine* createByConfigProtoStr(
-      const std::string& protoStr,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * Create by ModelConfig object.
-   *
-   * To get ModelConfig, you can get TrainerConfig from config file, then get
-   * model config by TrainerConfig
-   */
-  static GradientMachine* createByModelConfig(
-      ModelConfig* conf,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * @brief finish
-   */
-  void finish();
-
-  void start();
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  void prefetch(const Arguments& inArgs);
-
-  /**
-   * Do some thing when train pass ended.
-   */
-  void onPassEnd();
-
-  /**
-   * The forward stage of GradientMachine.
-   *
-   * @note  the outArgs could be zero length arguemnts.
-   * @note  THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL.
-   */
-  void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType);
-
-  /**
-   * The backward stage of GradientMachine.
-   *
-   * @note  Currently the ParameterUpdater is not wrapped in SWIG, so backward
-   * cannot actually train a network. But you can write a update callback to
-   * change the parameter or implement a ParameterUpdater in python side.
-   */
-  void backward(const UpdateCallback& callback = UpdateCallback());
-
-  /**
-   * Combine forward/backward
-   */
-  void forwardBackward(const Arguments& inArgs,
-                       Arguments* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback = UpdateCallback());
-
-  void loadParameters(const std::string& path);
-
-  size_t getParameterSize() const;
-  Parameter* getParameter(size_t i) throw(RangeError);
-
-  size_t getNonStaticParameterSize() const;
-  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
-
-  void randParameters();
-
-  Arguments* getLayerOutput(const std::string& layerName) const
-      throw(UnsupportError);
-
-  /**
-   * Create a sequence generator.
-   *
-   * @note  It just like a paddle_gen_sequence.
-   */
-  SequenceGenerator* asSequenceGenerator(
-      const std::vector<std::string>& dict = std::vector<std::string>(),
-      size_t begin_id = 0UL,
-      size_t end_id = 0UL,
-      size_t max_length = 100UL,
-      size_t beam_size = -1UL);
-
-  Evaluator* makeEvaluator();
-
-  void eval(Evaluator* evaluator);
-
- private:
-  GradientMachinePrivate* m;
-
-  static GradientMachine* createFromPaddleModelPtr(
-      const void* confPtr,
-      GradientMatchineCreateMode mode,
-      const std::vector<int>& types);
-
-  // Not to use c++ 11 init-list, so we use static var as function default arg.
-  static std::vector<int> defaultParamTypes;
-  friend class Trainer;
-  friend class ParameterUpdater;
-};
-
-struct ParameterUpdaterPrivate;
-class ParameterUpdater {
- private:
-  ParameterUpdater();
-
- public:
-  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
-  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount,
-                                               bool useSparseUpdater);
-  static ParameterUpdater* createNewRemoteUpdater(
-      OptimizationConfig* config,
-      const std::string pserverSpec,
-      const bool useEtcd) throw(UnsupportError);
-  ~ParameterUpdater();
-
-  /**
-   * @brief initialize Parameter Updater by GradientMachine.
-   * @param gm
-   */
-  void init(const GradientMachine& gm);
-
-  /**
-   * @brief begin of a training/testing of one pass.
-   */
-  void startPass();
-
-  /**
-   * @brief end of a traning/testing of one pass.
-   */
-  void finishPass();
-
-  /**
-   * @brief begin of a training/testing of one batch.
-   * @param data batch's size
-   * @return PassType, mostly will be training.
-   */
-  PassType startBatch(size_t batchSize);
-
-  /**
-   * @brief end of a traning/testing of one batch
-   * @param cost current batch cost.
-   */
-  void finishBatch(float cost);
-
-  /**
-   * @brief update a parameter (by local optimizer or by cluster pserver)
-   * @param param
-   */
-  void update(Parameter* param);
-
-  /**
-   * @breif only get required sparse rows by default.
-   * @param fullSize: get full matrix parameter if *fullSize* set
-   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
-   */
-  void getParametersRemote(bool fullSize = false, bool apply = false);
-
-  /**
-   * @brief restore the average parameter.
-   * @note It is only used in AverageOptimizer. Restore will get the current
-   * PARAMETER_VALUE back.
-   */
-  void restore();
-
-  /**
-   * @brief apply. Store the average parameter.
-   * @note It is only used in AverageOptimizer. Apply will store the current
-   * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save
-   * it to PARAMETER_VALUE.
-   */
-  void apply();
-
-  /**
-   * @brief catchUpWith The Regularization will be delayed in many situations(
-   * pserver, local sparse). Catch Up means catch the regularization up, apply
-   * regularization to all params.
-   */
-  void catchUpWith();
-
- private:
-  ParameterUpdaterPrivate* m;
-};
-
-struct EvaluatorPrivate;
-class Evaluator {
- private:
-  Evaluator();
-  DISABLE_COPY(Evaluator);
-
- public:
-  ~Evaluator();
-
-  /**
-   * @brief begin an evaluate stage.
-   */
-  void start();
-
-  /**
-   * @brief end an evaluate stage.
-   */
-  void finish();
-
-  /**
-   * @brief toString will get a evaluate result.
-   *
-   * __repr__ method in python
-   */
-  std::string toString();
-
-  std::vector<std::string> getNames() const;
-
-  double getValue(const std::string name) const;
-
- private:
-  EvaluatorPrivate* m;
-
-  friend class GradientMachine;
-};
-
-struct TrainerPrivate;
-class Trainer {
- private:
-  TrainerPrivate* m;
-  Trainer();
-  Trainer(TrainerConfig* optConfig, GradientMachine* gm);
-  DISABLE_COPY(Trainer);
-
- public:
-  virtual ~Trainer();
-
-  /// Create A Trainer By TrainerConfig. using paddle command line.
-  static Trainer* createByCommandLine() throw(IOError);
-
-  static Trainer* create(TrainerConfig* optConfig,
-                         GradientMachine* gm) throw(IOError);
-
-  /// Start training
-  void startTrain();
-
-  /// Finish training
-  void finishTrain();
-
-  /// Start a pass.
-  void startTrainPass();
-
-  /// Finish a pass
-  void finishTrainPass();
-
-  /**
-   * Train one batch,
-   *
-   * @return true if all batch finished.
-   */
-  bool trainOneBatch(size_t batchSize);
-
-  void trainOneDataBatch(size_t batchSize, const Arguments& args);
-
-  void startTestPeriod();
-  void testOneDataBatch(size_t batchSize, const Arguments& args);
-  void finishTestPeriod();
-
-  void forwardOneBatch(size_t batchSize);
-
-  Arguments* getForwardOutput();
-
-  Arguments* getLayerOutput(const std::string& layerName) const;
-};
-
-/// the N-Best results generated from one input sequence.
-class ISequenceResults {
- public:
-  virtual ~ISequenceResults();
-
-  /// Number of result.
-  virtual size_t getSize() const = 0;
-
-  /**
-   * Get sentence from dictionary.
-   *
-   * @param id  the index of result.
-   * @param split  if true, the return sentence will be splited with ' ' by
-   *               each word. Default is false.
-   */
-  virtual std::string getSentence(size_t id, bool split = false) const
-      throw(RangeError) = 0;
-  virtual std::vector<int> getSequence(size_t id) const throw(RangeError) = 0;
-  virtual float getScore(size_t id) const throw(RangeError) = 0;
-};
-
-struct SequenceGeneratorPrivate;
-class SequenceGenerator {
-  DISABLE_COPY(SequenceGenerator);
-  SequenceGenerator();
-
- public:
-  virtual ~SequenceGenerator();
-
-  /**
-   * Generate Sequence by input.
-   *
-   * @note  The inArgs is just one sequence of data.
-   * @note  The return will get a N-best generate result by inArgs.
-   *        Sort by score.
-   */
-  ISequenceResults* generateSequence(const Arguments& inArgs) const;
-
-  void setDict(const std::vector<std::string>& dict);
-  void setBos(size_t bos);
-  void setEos(size_t eos);
-  void setMaxLength(size_t maxlength);
-  void setBeamSize(size_t beamSize);
-
- private:
-  static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
-  friend class GradientMachine;
-
- private:
-  SequenceGeneratorPrivate* m;
-};
diff --git a/paddle/legacy/api/PaddleAPIPrivate.h b/paddle/legacy/api/PaddleAPIPrivate.h
deleted file mode 100644
index 3ee192c31d5..00000000000
--- a/paddle/legacy/api/PaddleAPIPrivate.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <memory>
-#include "PaddleAPI.h"
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-#include "paddle/legacy/trainer/TrainerConfigHelper.h"
-
-struct GradientMachinePrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-};
-
-struct OptimizationConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
-  paddle::OptimizationConfig config;
-
-  const paddle::OptimizationConfig& getConfig() {
-    if (trainer_config != nullptr) {
-      return trainer_config->getOptConfig();
-    } else {
-      return config;
-    }
-  }
-};
-
-struct TrainerConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-  TrainerConfigPrivate() {}
-};
-
-struct ModelConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-};
-
-struct ArgumentsPrivate {
-  std::vector<paddle::Argument> outputs;
-
-  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
-    if (idx < outputs.size()) {
-      return outputs[idx];
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
-  template <typename T>
-  std::shared_ptr<T>& cast(void* rawPtr) const {
-    return *(std::shared_ptr<T>*)(rawPtr);
-  }
-};
-
-struct ParameterUpdaterPrivate {
-  std::unique_ptr<paddle::ParameterUpdater> updater;
-};
-
-struct ParameterPrivate {
-  std::shared_ptr<paddle::Parameter> sharedPtr;
-  paddle::Parameter* rawPtr;  // rawPtr only used in ParameterUpdater,
-                              // in other situation sharedPtr should
-                              // contains value.
-
-  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
-
-  paddle::Parameter* getPtr() {
-    if (sharedPtr) {
-      return sharedPtr.get();
-    } else {
-      return rawPtr;
-    }
-  }
-};
-
-struct EvaluatorPrivate {
-  paddle::Evaluator* rawPtr;
-
-  EvaluatorPrivate() : rawPtr(nullptr) {}
-  ~EvaluatorPrivate() { delete rawPtr; }
-};
diff --git a/paddle/legacy/api/Parameter.cpp b/paddle/legacy/api/Parameter.cpp
deleted file mode 100644
index f05740eb750..00000000000
--- a/paddle/legacy/api/Parameter.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/parameter/Parameter.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-Parameter::Parameter() : m(new ParameterPrivate()) {}
-
-Parameter::~Parameter() { delete m; }
-
-Parameter* Parameter::createFromRawPtr(void* ptr) {
-  auto p = new Parameter();
-  p->m->rawPtr = *static_cast<paddle::Parameter**>(ptr);
-  return p;
-}
-
-Parameter* Parameter::createFromSharedPtr(void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p == nullptr) {
-    return nullptr;
-  } else {
-    auto retParam = new Parameter();
-    retParam->m->sharedPtr = p;
-    return retParam;
-  }
-}
-
-std::string Parameter::getName() const { return m->getPtr()->getName(); }
-
-Vector* Parameter::getBuf(ParameterType type) {
-  auto buf = m->getPtr()->getBuf(type);
-  return Vector::createByPaddleVectorPtr(&buf);
-}
-
-ParameterConfig* Parameter::getConfig() {
-  if (m->sharedPtr) {
-    return ParameterConfig::createParameterConfigFromParameterSharedPtr(
-        &m->sharedPtr);
-  } else {
-    return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr);
-  }
-}
-
-size_t Parameter::getID() const { return m->getPtr()->getID(); }
-
-void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
-
-bool Parameter::save(const std::string& filename) const {
-  return m->getPtr()->save(filename);
-}
-
-bool Parameter::load(const std::string& filename) const {
-  return m->getPtr()->load(filename);
-}
-
-size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/legacy/api/ParameterOptimizer.cpp b/paddle/legacy/api/ParameterOptimizer.cpp
deleted file mode 100644
index 477d9dae443..00000000000
--- a/paddle/legacy/api/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include <algorithm>
-#include "Internal.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-struct ParameterOptimizerPrivate {
-  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
-};
-
-struct ParameterTraverseCallbackPrivate {
-  paddle::ParameterOptimizer::TraverseCallback callback;
-
-  ParameterTraverseCallbackPrivate() {}
-
-  ParameterTraverseCallbackPrivate(
-      const paddle::ParameterOptimizer::TraverseCallback& callback)
-      : callback(callback) {}
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& conf,
-             size_t sparseId) {
-    std::vector<paddle::VectorPtr> real_vecs;
-    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-      if (v) {
-        return *(paddle::VectorPtr*)(v->getSharedPtr());
-      } else {
-        return paddle::VectorPtr();
-      }
-    });
-
-    paddle::ParameterConfig& real_conf =
-        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
-                                        .getRawPtr());
-    callback(real_vecs.data(), real_conf, sparseId);
-  }
-};
-
-ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
-
-ParameterOptimizer::~ParameterOptimizer() { delete m; }
-
-ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
-  CHECK(config != nullptr);
-  auto retOptimizer = new ParameterOptimizer();
-  retOptimizer->m->optimizer.reset(
-      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
-  return retOptimizer;
-}
-
-void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
-  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
-                                               ->getRawPtr());
-  m->optimizer->init(numRows, &conf);
-}
-
-void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
-
-void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
-
-void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
-  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
-  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
-  m->optimizer->startBatch((int64_t)numSamplesProcessed);
-}
-
-void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
-
-void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
-                                const ParameterConfig& conf,
-                                size_t sparseId) {
-  ParameterTraverseCallbackPrivate invoker(
-      [&](const paddle::VectorPtr _vecs[],
-          const paddle::ParameterConfig& config,
-          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
-  invoker.apply(vecs, conf, sparseId);
-}
-
-std::vector<int> ParameterOptimizer::getParameterTypes() const {
-  std::vector<int> returnValue;
-  staticCastVector(&returnValue, m->optimizer->getParameterTypes());
-  return returnValue;
-}
-
-ParameterTraverseCallback::ParameterTraverseCallback()
-    : m(new ParameterTraverseCallbackPrivate()) {}
-
-ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
-
-void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
-                                      const ParameterConfig& conf,
-                                      size_t sparseId) {
-  m->apply(vecs, conf, sparseId);
-}
-
-ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  auto& param_config =
-      *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
-           .getRawPtr();
-  auto callback = m->optimizer->needSpecialTraversal(param_config);
-  if (callback) {
-    auto retCallback = new ParameterTraverseCallback();
-    retCallback->m->callback = callback;
-    return retCallback;
-  } else {
-    return nullptr;
-  }
-}
diff --git a/paddle/legacy/api/ParameterUpdater.cpp b/paddle/legacy/api/ParameterUpdater.cpp
deleted file mode 100644
index 44af3f4635f..00000000000
--- a/paddle/legacy/api/ParameterUpdater.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "PaddleAPIPrivate.h"
-#ifndef PADDLE_WITHOUT_GOLANG
-#include "paddle/legacy/trainer/NewRemoteParameterUpdater.h"
-#endif
-#include "paddle/legacy/trainer/RemoteParameterUpdater.h"
-#include "paddle/legacy/trainer/ThreadParameterUpdater.h"
-
-ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
-
-ParameterUpdater *ParameterUpdater::createLocalUpdater(
-    OptimizationConfig *config) {
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(
-      new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return updater;
-}
-
-ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
-    OptimizationConfig *config,
-    const std::string pserverSpec,
-    const bool useEtcd) throw(UnsupportError) {
-#ifndef PADDLE_WITHOUT_GOLANG
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec, useEtcd));
-  return updater;
-#else
-  throw UnsupportError("not compiled with WITH_GOLANG");
-#endif
-}
-
-ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
-  auto updater = new ParameterUpdater();
-  auto remoteUpdater = new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr);
-  if (useSparseUpdater) {
-    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
-    auto sparseRemoteUpdater =
-        new paddle::SparseRemoteParameterUpdaterComposite(
-            config->m->getConfig(),
-            passCount,
-            false,
-            std::move(remoteUpdaterPtr));
-    updater->m->updater.reset(sparseRemoteUpdater);
-  } else {
-    updater->m->updater.reset(remoteUpdater);
-  }
-  return updater;
-}
-
-ParameterUpdater::~ParameterUpdater() { delete m; }
-
-void ParameterUpdater::init(const GradientMachine &gm) {
-  m->updater->init(gm.m->machine->getNonStaticParameters());
-}
-
-void ParameterUpdater::startPass() { m->updater->startPass(); }
-
-void ParameterUpdater::finishPass() { m->updater->finishPass(); }
-
-PassType ParameterUpdater::startBatch(size_t batchSize) {
-  return m->updater->startBatch((int64_t)batchSize);
-}
-
-void ParameterUpdater::finishBatch(float cost) {
-  m->updater->finishBatch(cost);
-}
-
-void ParameterUpdater::update(Parameter *param) {
-  auto paddleParam = param->m->getPtr();
-  m->updater->update(paddleParam);
-}
-
-void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
-  m->updater->getParametersRemote(fullSize, apply);
-}
-
-void ParameterUpdater::restore() { m->updater->restore(); }
-
-void ParameterUpdater::apply() { m->updater->apply(); }
-
-void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/legacy/api/SequenceGenerator.cpp b/paddle/legacy/api/SequenceGenerator.cpp
deleted file mode 100644
index 2a73228f6d4..00000000000
--- a/paddle/legacy/api/SequenceGenerator.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <iterator>
-#include <sstream>
-#include <vector>
-#include "PaddleAPI.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/Flags.h"
-
-// used to represent partial sequence
-struct Path {
-  std::vector<int> ids;
-  float logProb;
-  paddle::MachineState machineState;
-
-  Path() { logProb = 0; }
-
-  Path(std::vector<int>& ids, float logProb, paddle::MachineState& machineState)
-      : ids(ids), logProb(logProb), machineState(machineState) {}
-
-  bool operator<(const Path& other) const { return (logProb > other.logProb); }
-};
-
-// Return top k (k == beam_size) optimal paths using beam search. The last
-// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer
-// as output and outArgs thus stores top k labels and their probabilities per
-// position
-static void findNBest(paddle::GradientMachine* gradMachine,
-                      std::vector<paddle::Argument>& inArgs,
-                      std::vector<Path>& finalPaths,
-                      size_t bos_id,
-                      size_t eos_id,
-                      size_t max_length) {
-  std::vector<Path> paths;
-  Path emptyPath;
-  paths.push_back(emptyPath);
-  finalPaths.clear();
-  gradMachine->resetState();
-  paddle::Argument feedback = inArgs.back();
-  feedback.ids->setElement(0, (int)(bos_id));
-  float minFinalPathLogProb = 0;
-  size_t beam = 0;
-  int id;
-  std::vector<paddle::Argument> outArgs;
-  while (true) {  // iterate over each generated word
-    std::vector<Path> newPaths;
-    paddle::MachineState machineState;
-    for (size_t j = 0; j < paths.size(); j++) {
-      Path& path = paths[j];
-      if (path.machineState.size() > 0) {
-        gradMachine->setState(path.machineState);
-        feedback.ids->setElement(0, path.ids.back());
-      }
-      gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST);
-      gradMachine->getState(machineState);
-      beam = outArgs[0].ids->getSize();
-      for (size_t k = 0; k < beam; k++) {
-        id = outArgs[0].ids->getElement(k);
-        float prob = outArgs[0].in->getElement(0, k);
-        std::vector<int> nids(path.ids);
-        nids.push_back(id);
-        float newLogProb = path.logProb + log(prob);
-        Path newPath(nids, newLogProb, machineState);
-        if (id == (int)eos_id || nids.size() >= max_length) {
-          finalPaths.push_back(newPath);
-          if (minFinalPathLogProb > newPath.logProb) {
-            minFinalPathLogProb = newPath.logProb;
-          }
-        } else {
-          newPaths.push_back(newPath);
-        }
-      }
-    }
-
-    if (newPaths.size() == 0) {
-      break;
-    }
-    std::nth_element(newPaths.begin(),
-                     newPaths.begin() + std::min(beam, newPaths.size()),
-                     newPaths.end());
-    if (newPaths.size() > beam) {
-      newPaths.resize(beam);
-    }
-    // pathA < pathB means pathA.logProb > pathB.logProb
-    float maxPathLogProb =
-        std::min_element(newPaths.begin(), newPaths.end())->logProb;
-    if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) {
-      break;
-    }
-    paths = newPaths;
-  }  // end while
-
-  std::partial_sort(finalPaths.begin(),
-                    finalPaths.begin() + std::min(beam, finalPaths.size()),
-                    finalPaths.end());
-  if (finalPaths.size() > beam) {
-    finalPaths.resize(beam);
-  }
-}
-
-struct SequenceGeneratorPrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-  std::shared_ptr<std::vector<std::string>> dict;
-  size_t beginPos;
-  size_t endPos;
-  size_t maxLength;
-
-  paddle::Argument feedback;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-
-  inline void findNBest(std::vector<paddle::Argument>& inArgs,
-                        std::vector<Path>& path) {
-    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
-  }
-
-  SequenceGeneratorPrivate()
-      : dict(std::make_shared<std::vector<std::string>>()),
-        beginPos(0UL),
-        endPos(0UL),
-        maxLength(0UL),
-        feedback(__create_feedback__()) {}
-
- private:
-  static paddle::Argument __create_feedback__() {
-    paddle::Argument feedback;
-    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
-
-    feedback.sequenceStartPositions =
-        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
-    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
-    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
-    return feedback;
-  }
-};
-
-SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
-
-SequenceGenerator::~SequenceGenerator() { delete m; }
-
-class PathSequenceResults : public ISequenceResults {
-  // ISequenceResults interface
- public:
-  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
-                      const std::shared_ptr<std::vector<std::string>>& dict)
-      : path_(path), dict_(dict) {}
-
-  size_t getSize() const { return path_->size(); }
-  std::string getSentence(size_t id, bool split) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      std::ostringstream sout;
-      std::transform(p.ids.begin(),
-                     p.ids.end(),
-                     std::ostream_iterator<std::string>(sout, split ? " " : ""),
-                     [&](int id) { return (*dict_)[id]; });
-      return sout.str();
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  std::vector<int> getSequence(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.ids;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  float getScore(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.logProb;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
- private:
-  std::shared_ptr<std::vector<Path>> path_;
-  std::shared_ptr<std::vector<std::string>> dict_;
-};
-
-ISequenceResults* SequenceGenerator::generateSequence(
-    const Arguments& inArgs) const {
-  auto& in_args =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  for (auto& arg : in_args) {
-    arg.sequenceStartPositions = m->feedback.sequenceStartPositions;
-  }
-  in_args.push_back(m->feedback);
-  auto path = std::make_shared<std::vector<Path>>();
-  m->findNBest(in_args, *path);
-  return new PathSequenceResults(path, m->dict);
-}
-
-SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr(
-    void* ptr) {
-  SequenceGenerator* r = new SequenceGenerator();
-  r->m->machine = r->m->cast<std::shared_ptr<paddle::GradientMachine>>(ptr);
-  return r;
-}
-
-void SequenceGenerator::setDict(const std::vector<std::string>& dict) {
-  *m->dict = dict;
-}
-
-void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; }
-
-void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; }
-
-void SequenceGenerator::setMaxLength(size_t maxLength) {
-  m->maxLength = maxLength;
-}
-
-void SequenceGenerator::setBeamSize(size_t beamSize) {
-  if (beamSize != -1UL) {
-    FLAGS_beam_size = beamSize;
-  }
-}
-
-ISequenceResults::~ISequenceResults() {}
diff --git a/paddle/legacy/api/Trainer.cpp b/paddle/legacy/api/Trainer.cpp
deleted file mode 100644
index e7c607201b0..00000000000
--- a/paddle/legacy/api/Trainer.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include <stdlib.h>
-#include <atomic>
-#include <memory>
-
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/trainer/ParamUtil.h"
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/trainer/TrainerInternal.h"
-#include "paddle/legacy/utils/Flags.h"
-
-using paddle::real;
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-
-struct TrainerPrivate : public paddle::Trainer {
-  bool _trainOneBatch(size_t batchSize);
-  bool forwardOneBatch(size_t batchSize);
-  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
-  void setBatchSize(size_t batchSize);
-  std::vector<paddle::Argument>& getForwardOutput();
-
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const paddle::DataBatch& dataBatch);
-  TrainerPrivate() : paddle::Trainer() {}
-};
-
-Trainer::Trainer() : m(new TrainerPrivate()) {
-  auto conf = paddle::TrainerConfigHelper::createFromFlags();
-  if (conf != nullptr) {
-    m->init(conf);
-  }
-}
-
-Trainer::~Trainer() { delete m; }
-
-Trainer* Trainer::createByCommandLine() throw(IOError) {
-  auto retv = new Trainer();
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    throw IOError();
-  }
-}
-
-Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
-    : m(new TrainerPrivate()) {
-  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
-}
-
-Trainer* Trainer::create(TrainerConfig* config,
-                         GradientMachine* gm) throw(IOError) {
-  auto retv = new Trainer(config, gm);
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    retv->m->getConfig().CheckInitialized();
-    throw IOError();
-  }
-}
-
-void Trainer::startTrain() { m->startTrain(); }
-
-void Trainer::finishTrain() { m->finishTrain(); }
-
-void Trainer::startTrainPass() { m->startTrainPass(); }
-
-void Trainer::finishTrainPass() { m->finishTrainPass(); }
-
-void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = inArgs.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->trainOneDataBatch(dataBatch);
-}
-
-bool Trainer::trainOneBatch(size_t batchSize) {
-  return m->_trainOneBatch(batchSize);
-}
-
-bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
-  paddle::DataBatch dataBatch;
-  CHECK(dataProvider_) << "data_provider is not specified";
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-  trainOneDataBatch(dataBatch);
-  return false;
-}
-
-void TrainerPrivate::startTestPeriod() {
-  if (!tester_) {
-    createTester();
-  }
-  tester_->startTestPeriod();
-}
-
-void Trainer::startTestPeriod() { m->startTestPeriod(); }
-
-void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
-  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
-}
-
-void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = args.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->testOneDataBatch(dataBatch);
-}
-
-void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
-void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
-
-Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
-  auto nn = this->m->getGradientMachine();
-  CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto arg = nn->getLayerOutput(layerName);
-  return Arguments::createByPaddleArgument(&arg);
-}
-
-void Trainer::forwardOneBatch(size_t batchSize) {
-  m->forwardOneBatch(batchSize);
-}
-
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
-  CHECK(dataProvider_) << "data_provider is not specified";
-  paddle::DataBatch dataBatch;
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-
-  forwardOneDataBatch(dataBatch.getStreams());
-  return true;
-}
-
-void TrainerPrivate::forwardOneDataBatch(
-    const std::vector<paddle::Argument>& inArgs) {
-  std::vector<paddle::Argument>& outArgs = forwardOutput_;
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    trainerInternal_.getGradientMachine()->prefetch(inArgs);
-    trainerInternal_.getParameterUpdater()->getParametersRemote();
-  }
-  trainerInternal_.getGradientMachine()->forward(
-      inArgs, &outArgs, paddle::PASS_TEST);
-}
-
-Arguments* Trainer::getForwardOutput() {
-  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
-}
-
-std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
-  return forwardOutput_;
-}
diff --git a/paddle/legacy/api/Util.cpp b/paddle/legacy/api/Util.cpp
deleted file mode 100644
index b458c4d90ec..00000000000
--- a/paddle/legacy/api/Util.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-
-void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-FloatArray::FloatArray(const float* b, const size_t l)
-    : buf(b), length(l), needFree(false) {}
-
-IntArray::IntArray(const int* b, const size_t l, bool f)
-    : buf(b), length(l), needFree(f) {}
-
-IntWithFloatArray::IntWithFloatArray(const float* v,
-                                     const int* i,
-                                     size_t l,
-                                     bool f)
-    : valBuf(v), idxBuf(i), length(l), needFree(f) {}
-
-bool isUsingGpu() { return FLAGS_use_gpu; }
-
-void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
-
-bool isGpuVersion() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-int getTrainerCount() { return FLAGS_trainer_count; }
-
-static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
-              "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/legacy/api/Vector.cpp b/paddle/legacy/api/Vector.cpp
deleted file mode 100644
index 73b6d3a15d6..00000000000
--- a/paddle/legacy/api/Vector.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/legacy/math/Vector.h"
-
-#include <cstring>
-
-struct IVectorPrivate {
-  paddle::IVectorPtr vec;
-};
-
-IVector::IVector() : m(new IVectorPrivate()) {}
-
-IVector* IVector::createZero(size_t sz, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(sz, useGpu);
-  v->m->vec->zeroMem();
-  return v;
-}
-
-IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(data.size(), useGpu);
-  v->m->vec->copyFrom(data.data(), data.size());
-  return v;
-}
-
-IVector* IVector::createVectorFromNumpy(int* data,
-                                        int dim,
-                                        bool copy,
-                                        bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=true is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return IVector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return IVector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
-  auto v = new IVector();
-  if (copy) {
-    v->m->vec = paddle::IVector::create(dim, false);
-    v->m->vec->copyFrom(data, dim);
-  } else {
-    v->m->vec = paddle::IVector::create(data, dim, false);
-  }
-  return v;
-}
-
-IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(dim, true);
-  v->m->vec->copyFrom(data, dim);
-  return v;
-}
-
-bool IVector::isGpu() const {
-  return dynamic_cast<paddle::GpuIVector*>(m->vec.get()) != nullptr;
-}
-
-IntArray IVector::getData() const {
-  if (this->isGpu()) {
-    int* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    int* dest = new int[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(int));
-    return IntArray(dest, len, true);
-  } else {
-    return IntArray(m->vec->getData(), m->vec->getSize());
-  }
-}
-
-int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) {
-  if (this->isGpu()) {
-    UnsupportError e;
-    throw e;
-  } else {
-    if (idx >= m->vec->getSize()) {
-      RangeError e;
-      throw e;
-    }
-  }
-  return m->vec->getData()[idx];
-}
-
-const int& IVector::operator[](const size_t idx) const
-    throw(RangeError, UnsupportError) {
-  return (*const_cast<IVector*>(this))[idx];
-}
-
-IVector* IVector::createByPaddleVectorPtr(void* ptr) {
-  auto* p = (paddle::IVectorPtr*)ptr;
-  if ((*p) != nullptr) {
-    IVector* vec = new IVector();
-    vec->m->vec = *p;
-    return vec;
-  } else {
-    return nullptr;
-  }
-}
-
-IVector::~IVector() { delete m; }
-
-void* IVector::getSharedPtr() const { return &m->vec; }
-
-size_t IVector::getSize() const { return m->vec->getSize(); }
-
-void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuIVector>(m->vec);
-  if (v) {
-    *data = v->getData();
-    *dim1 = v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new int[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void IVector::copyFromNumpyArray(int* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-struct VectorPrivate {
-  paddle::VectorPtr vec;
-
-  void safeAccessData(const size_t idx,
-                      const std::function<void(float&)>& func) const
-      throw(RangeError, UnsupportError) {
-    auto cpuVec = std::dynamic_pointer_cast<const paddle::CpuVector>(vec);
-    if (cpuVec != nullptr) {
-      if (idx < vec->getSize()) {
-        func(vec->getData()[idx]);
-      } else {
-        throw RangeError();
-      }
-    } else {
-      throw UnsupportError();
-    }
-  }
-};
-
-Vector::Vector() : m(new VectorPrivate()) {}
-
-Vector::~Vector() { delete m; }
-
-Vector* Vector::createZero(size_t sz, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(sz, useGpu);
-  retVec->m->vec->zero();
-  return retVec;
-}
-
-Vector* Vector::create(const std::vector<float>& data, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(data.size(), useGpu);
-  retVec->m->vec->copyFrom(data.data(), data.size());
-  return retVec;
-}
-
-Vector* Vector::createByPaddleVectorPtr(void* ptr) {
-  auto& v = *(paddle::VectorPtr*)(ptr);
-  if (v == nullptr) {
-    return nullptr;
-  } else {
-    auto retVec = new Vector();
-    retVec->m->vec = v;
-    return retVec;
-  }
-}
-
-Vector* Vector::createVectorFromNumpy(float* data,
-                                      int dim,
-                                      bool copy,
-                                      bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=True is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Vector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return Vector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  if (copy) {
-    retVec->m->vec = paddle::Vector::create((size_t)dim, false);
-    retVec->m->vec->copyFrom(data, dim);
-  } else {
-    retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
-  }
-  return retVec;
-}
-
-Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create((size_t)dim, true);
-  retVec->m->vec->copyFrom(data, (size_t)dim);
-  return retVec;
-}
-
-void Vector::toNumpyArrayInplace(float** view_data,
-                                 int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuVector>(m->vec);
-  if (v != nullptr) {
-    *view_data = v->getData();
-    *dim1 = (int)v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new float[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void Vector::copyFromNumpyArray(float* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-FloatArray Vector::getData() const {
-  if (this->isGpu()) {
-    float* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    float* dest = new float[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(float));
-    FloatArray ret_val(dest, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(m->vec->getData(), m->vec->getSize());
-    return ret_val;
-  }
-}
-
-void Vector::copyFrom(Vector* src) throw(RangeError) {
-  if (src->m->vec->getSize() != m->vec->getSize()) {
-    throw RangeError();
-  }
-  m->vec->copyFrom(*src->m->vec);
-}
-
-bool Vector::isGpu() const {
-  return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
-}
-
-float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) {
-  float r;
-  m->safeAccessData(idx, [&](float& o) { r = o; });
-  return r;
-}
-
-void Vector::set(const size_t idx, float val) throw(RangeError,
-                                                    UnsupportError) {
-  m->safeAccessData(idx, [&](float& o) { o = val; });
-}
-
-size_t Vector::getSize() const { return m->vec->getSize(); }
-
-void* Vector::getSharedPtr() { return &m->vec; }
diff --git a/paddle/legacy/api/__init__.py b/paddle/legacy/api/__init__.py
deleted file mode 100644
index f662d682632..00000000000
--- a/paddle/legacy/api/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/api/numpy.i b/paddle/legacy/api/numpy.i
deleted file mode 100644
index 2ddc11de7a4..00000000000
--- a/paddle/legacy/api/numpy.i
+++ /dev/null
@@ -1,3161 +0,0 @@
-/* -*- C -*-  (not really, but good for syntax highlighting) */
-
-/*
- * Copyright (c) 2005-2015, NumPy Developers.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- *        notice, this list of conditions and the following disclaimer.
- *
- *     * Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials provided
- *        with the distribution.
- *
- *     * Neither the name of the NumPy Developers nor the names of any
- *        contributors may be used to endorse or promote products derived
- *        from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef SWIGPYTHON
-
-%{
-#ifndef SWIG_FILE_WITH_INIT
-#define NO_IMPORT_ARRAY
-#endif
-#include "stdio.h"
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/arrayobject.h>
-%}
-
-/**********************************************************************/
-
-%fragment("NumPy_Backward_Compatibility", "header")
-{
-%#if NPY_API_VERSION < 0x00000007
-%#define NPY_ARRAY_DEFAULT NPY_DEFAULT
-%#define NPY_ARRAY_FARRAY  NPY_FARRAY
-%#define NPY_FORTRANORDER  NPY_FORTRAN
-%#endif
-}
-
-/**********************************************************************/
-
-/* The following code originally appeared in
- * enthought/kiva/agg/src/numeric.i written by Eric Jones.  It was
- * translated from C++ to C by John Hunter.  Bill Spotz has modified
- * it to fix some minor bugs, upgrade from Numeric to numpy (all
- * versions), add some comments and functionality, and convert from
- * direct code insertion to SWIG fragments.
- */
-
-%fragment("NumPy_Macros", "header")
-{
-/* Macros to extract array attributes.
- */
-%#if NPY_API_VERSION < 0x00000007
-%#define is_array(a)            ((a) && PyArray_Check((PyArrayObject*)a))
-%#define array_type(a)          (int)(PyArray_TYPE((PyArrayObject*)a))
-%#define array_numdims(a)       (((PyArrayObject*)a)->nd)
-%#define array_dimensions(a)    (((PyArrayObject*)a)->dimensions)
-%#define array_size(a,i)        (((PyArrayObject*)a)->dimensions[i])
-%#define array_strides(a)       (((PyArrayObject*)a)->strides)
-%#define array_stride(a,i)      (((PyArrayObject*)a)->strides[i])
-%#define array_data(a)          (((PyArrayObject*)a)->data)
-%#define array_descr(a)         (((PyArrayObject*)a)->descr)
-%#define array_flags(a)         (((PyArrayObject*)a)->flags)
-%#define array_enableflags(a,f) (((PyArrayObject*)a)->flags) = f
-%#else
-%#define is_array(a)            ((a) && PyArray_Check(a))
-%#define array_type(a)          PyArray_TYPE((PyArrayObject*)a)
-%#define array_numdims(a)       PyArray_NDIM((PyArrayObject*)a)
-%#define array_dimensions(a)    PyArray_DIMS((PyArrayObject*)a)
-%#define array_strides(a)       PyArray_STRIDES((PyArrayObject*)a)
-%#define array_stride(a,i)      PyArray_STRIDE((PyArrayObject*)a,i)
-%#define array_size(a,i)        PyArray_DIM((PyArrayObject*)a,i)
-%#define array_data(a)          PyArray_DATA((PyArrayObject*)a)
-%#define array_descr(a)         PyArray_DESCR((PyArrayObject*)a)
-%#define array_flags(a)         PyArray_FLAGS((PyArrayObject*)a)
-%#define array_enableflags(a,f) PyArray_ENABLEFLAGS((PyArrayObject*)a,f)
-%#endif
-%#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject*)a))
-%#define array_is_native(a)     (PyArray_ISNOTSWAPPED((PyArrayObject*)a))
-%#define array_is_fortran(a)    (PyArray_ISFORTRAN((PyArrayObject*)a))
-}
-
-/**********************************************************************/
-
-%fragment("NumPy_Utilities",
-          "header")
-{
-  /* Given a PyObject, return a string describing its type.
-   */
-  const char* pytype_string(PyObject* py_obj)
-  {
-    if (py_obj == NULL          ) return "C NULL value";
-    if (py_obj == Py_None       ) return "Python None" ;
-    if (PyCallable_Check(py_obj)) return "callable"    ;
-    if (PyString_Check(  py_obj)) return "string"      ;
-    if (PyInt_Check(     py_obj)) return "int"         ;
-    if (PyFloat_Check(   py_obj)) return "float"       ;
-    if (PyDict_Check(    py_obj)) return "dict"        ;
-    if (PyList_Check(    py_obj)) return "list"        ;
-    if (PyTuple_Check(   py_obj)) return "tuple"       ;
-%#if PY_MAJOR_VERSION < 3
-    if (PyFile_Check(    py_obj)) return "file"        ;
-    if (PyModule_Check(  py_obj)) return "module"      ;
-    if (PyInstance_Check(py_obj)) return "instance"    ;
-%#endif
-
-    return "unknown type";
-  }
-
-  /* Given a NumPy typecode, return a string describing the type.
-   */
-  const char* typecode_string(int typecode)
-  {
-    static const char* type_names[25] = {"bool",
-                                         "byte",
-                                         "unsigned byte",
-                                         "short",
-                                         "unsigned short",
-                                         "int",
-                                         "unsigned int",
-                                         "long",
-                                         "unsigned long",
-                                         "long long",
-                                         "unsigned long long",
-                                         "float",
-                                         "double",
-                                         "long double",
-                                         "complex float",
-                                         "complex double",
-                                         "complex long double",
-                                         "object",
-                                         "string",
-                                         "unicode",
-                                         "void",
-                                         "ntypes",
-                                         "notype",
-                                         "char",
-                                         "unknown"};
-    return typecode < 24 ? type_names[typecode] : type_names[24];
-  }
-
-  /* Make sure input has correct numpy type.  This now just calls
-     PyArray_EquivTypenums().
-   */
-  int type_match(int actual_type,
-                 int desired_type)
-  {
-    return PyArray_EquivTypenums(actual_type, desired_type);
-  }
-
-%#ifdef SWIGPY_USE_CAPSULE
-  void free_cap(PyObject * cap)
-  {
-    void* array = (void*) PyCapsule_GetPointer(cap,SWIGPY_CAPSULE_NAME);
-    if (array != NULL) free(array);
-  }
-%#endif
-
-
-}
-
-/**********************************************************************/
-
-%fragment("NumPy_Object_to_Array",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros",
-          fragment="NumPy_Utilities")
-{
-  /* Given a PyObject pointer, cast it to a PyArrayObject pointer if
-   * legal.  If not, set the python error string appropriately and
-   * return NULL.
-   */
-  PyArrayObject* obj_to_array_no_conversion(PyObject* input,
-                                            int        typecode)
-  {
-    PyArrayObject* ary = NULL;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input), typecode)))
-    {
-      ary = (PyArrayObject*) input;
-    }
-    else if is_array(input)
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = typecode_string(array_type(input));
-      PyErr_Format(PyExc_TypeError,
-                   "Array of type '%s' required.  Array of type '%s' given",
-                   desired_type, actual_type);
-      ary = NULL;
-    }
-    else
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = pytype_string(input);
-      PyErr_Format(PyExc_TypeError,
-                   "Array of type '%s' required.  A '%s' was given",
-                   desired_type,
-                   actual_type);
-      ary = NULL;
-    }
-    return ary;
-  }
-
-  /* Convert the given PyObject to a NumPy array with the given
-   * typecode.  On success, return a valid PyArrayObject* with the
-   * correct type.  On failure, the python error string will be set and
-   * the routine returns NULL.
-   */
-  PyArrayObject* obj_to_array_allow_conversion(PyObject* input,
-                                               int       typecode,
-                                               int*      is_new_object)
-  {
-    PyArrayObject* ary = NULL;
-    PyObject*      py_obj;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input),typecode)))
-    {
-      ary = (PyArrayObject*) input;
-      *is_new_object = 0;
-    }
-    else
-    {
-      py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT);
-      /* If NULL, PyArray_FromObject will have set python error value.*/
-      ary = (PyArrayObject*) py_obj;
-      *is_new_object = 1;
-    }
-    return ary;
-  }
-
-  /* Given a PyArrayObject, check to see if it is contiguous.  If so,
-   * return the input pointer and flag it as not a new object.  If it is
-   * not contiguous, create a new PyArrayObject using the original data,
-   * flag it as a new object and return the pointer.
-   */
-  PyArrayObject* make_contiguous(PyArrayObject* ary,
-                                 int*           is_new_object,
-                                 int            min_dims,
-                                 int            max_dims)
-  {
-    PyArrayObject* result;
-    if (array_is_contiguous(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      result = (PyArrayObject*) PyArray_ContiguousFromObject((PyObject*)ary,
-                                                              array_type(ary),
-                                                              min_dims,
-                                                              max_dims);
-      *is_new_object = 1;
-    }
-    return result;
-  }
-
-  /* Given a PyArrayObject, check to see if it is Fortran-contiguous.
-   * If so, return the input pointer, but do not flag it as not a new
-   * object.  If it is not Fortran-contiguous, create a new
-   * PyArrayObject using the original data, flag it as a new object
-   * and return the pointer.
-   */
-  PyArrayObject* make_fortran(PyArrayObject* ary,
-                              int*           is_new_object)
-  {
-    PyArrayObject* result;
-    if (array_is_fortran(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      Py_INCREF(array_descr(ary));
-      result = (PyArrayObject*) PyArray_FromArray(ary,
-                                                  array_descr(ary),
-                                                  NPY_FORTRANORDER);
-      *is_new_object = 1;
-    }
-    return result;
-  }
-
-  /* Convert a given PyObject to a contiguous PyArrayObject of the
-   * specified type.  If the input object is not a contiguous
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_contiguous_allow_conversion(PyObject* input,
-                                                          int       typecode,
-                                                          int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_contiguous(ary1, &is_new2, 0, 0);
-      if ( is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
-    }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
-  }
-
-  /* Convert a given PyObject to a Fortran-ordered PyArrayObject of the
-   * specified type.  If the input object is not a Fortran-ordered
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_fortran_allow_conversion(PyObject* input,
-                                                       int       typecode,
-                                                       int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_fortran(ary1, &is_new2);
-      if (is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
-    }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
-  }
-} /* end fragment */
-
-/**********************************************************************/
-
-%fragment("NumPy_Array_Requirements",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros")
-{
-  /* Test whether a python object is contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!array_is_contiguous(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous.  A non-contiguous array was given");
-      contiguous = 0;
-    }
-    return contiguous;
-  }
-
-  /* Test whether a python object is (C_ or F_) contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_c_or_f_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!(array_is_contiguous(ary) || array_is_fortran(ary)))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous (C_ or F_).  A non-contiguous array was given");
-      contiguous = 0;
-    }
-    return contiguous;
-  }
-
-  /* Require that a numpy array is not byte-swapped.  If the array is
-   * not byte-swapped, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_native(PyArrayObject* ary)
-  {
-    int native = 1;
-    if (!array_is_native(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must have native byteorder.  "
-                      "A byte-swapped array was given");
-      native = 0;
-    }
-    return native;
-  }
-
-  /* Require the given PyArrayObject to have a specified number of
-   * dimensions.  If the array has the specified number of dimensions,
-   * return 1.  Otherwise, set the python error string and return 0.
-   */
-  int require_dimensions(PyArrayObject* ary,
-                         int            exact_dimensions)
-  {
-    int success = 1;
-    if (array_numdims(ary) != exact_dimensions)
-    {
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %d dimensions.  Given array has %d dimensions",
-                   exact_dimensions,
-                   array_numdims(ary));
-      success = 0;
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to have one of a list of specified
-   * number of dimensions.  If the array has one of the specified number
-   * of dimensions, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_dimensions_n(PyArrayObject* ary,
-                           int*           exact_dimensions,
-                           int            n)
-  {
-    int success = 0;
-    int i;
-    char dims_str[255] = "";
-    char s[255];
-    for (i = 0; i < n && !success; i++)
-    {
-      if (array_numdims(ary) == exact_dimensions[i])
-      {
-        success = 1;
-      }
-    }
-    if (!success)
-    {
-      for (i = 0; i < n-1; i++)
-      {
-        sprintf(s, "%d, ", exact_dimensions[i]);
-        strcat(dims_str,s);
-      }
-      sprintf(s, " or %d", exact_dimensions[n-1]);
-      strcat(dims_str,s);
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %s dimensions.  Given array has %d dimensions",
-                   dims_str,
-                   array_numdims(ary));
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to have a specified shape.  If the
-   * array has the specified shape, return 1.  Otherwise, set the python
-   * error string and return 0.
-   */
-  int require_size(PyArrayObject* ary,
-                   npy_intp*      size,
-                   int            n)
-  {
-    int i;
-    int success = 1;
-    int len;
-    char desired_dims[255] = "[";
-    char s[255];
-    char actual_dims[255] = "[";
-    for(i=0; i < n;i++)
-    {
-      if (size[i] != -1 &&  size[i] != array_size(ary,i))
-      {
-        success = 0;
-      }
-    }
-    if (!success)
-    {
-      for (i = 0; i < n; i++)
-      {
-        if (size[i] == -1)
-        {
-          sprintf(s, "*,");
-        }
-        else
-        {
-          sprintf(s, "%ld,", (long int)size[i]);
-        }
-        strcat(desired_dims,s);
-      }
-      len = strlen(desired_dims);
-      desired_dims[len-1] = ']';
-      for (i = 0; i < n; i++)
-      {
-        sprintf(s, "%ld,", (long int)array_size(ary,i));
-        strcat(actual_dims,s);
-      }
-      len = strlen(actual_dims);
-      actual_dims[len-1] = ']';
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have shape of %s.  Given array has shape of %s",
-                   desired_dims,
-                   actual_dims);
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to to be Fortran ordered.  If the
-   * the PyArrayObject is already Fortran ordered, do nothing.  Else,
-   * set the Fortran ordering flag and recompute the strides.
-   */
-  int require_fortran(PyArrayObject* ary)
-  {
-    int success = 1;
-    int nd = array_numdims(ary);
-    int i;
-    npy_intp * strides = array_strides(ary);
-    if (array_is_fortran(ary)) return success;
-    /* Set the Fortran ordered flag */
-    array_enableflags(ary,NPY_ARRAY_FARRAY);
-    /* Recompute the strides */
-    strides[0] = strides[nd-1];
-    for (i=1; i < nd; ++i)
-      strides[i] = strides[i-1] * array_size(ary,i-1);
-    return success;
-  }
-}
-
-/* Combine all NumPy fragments into one for convenience */
-%fragment("NumPy_Fragments",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros",
-          fragment="NumPy_Utilities",
-          fragment="NumPy_Object_to_Array",
-          fragment="NumPy_Array_Requirements")
-{
-}
-
-/* End John Hunter translation (with modifications by Bill Spotz)
- */
-
-/* %numpy_typemaps() macro
- *
- * This macro defines a family of 75 typemaps that allow C arguments
- * of the form
- *
- *    1. (DATA_TYPE IN_ARRAY1[ANY])
- *    2. (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
- *    3. (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
- *
- *    4. (DATA_TYPE IN_ARRAY2[ANY][ANY])
- *    5. (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *    6. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
- *    7. (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *    8. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
- *
- *    9. (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
- *   10. (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   11. (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   12. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
- *   13. (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   14. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
- *
- *   15. (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
- *   16. (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   17. (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   18. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, , DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
- *   19. (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   20. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
- *
- *   21. (DATA_TYPE INPLACE_ARRAY1[ANY])
- *   22. (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
- *   23. (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
- *
- *   24. (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
- *   25. (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *   26. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
- *   27. (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *   28. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
- *
- *   29. (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
- *   30. (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   31. (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   32. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
- *   33. (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   34. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
- *
- *   35. (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
- *   36. (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   37. (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   38. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
- *   39. (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   40. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
- *
- *   41. (DATA_TYPE ARGOUT_ARRAY1[ANY])
- *   42. (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
- *   43. (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
- *
- *   44. (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
- *
- *   45. (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
- *
- *   46. (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
- *
- *   47. (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
- *   48. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
- *
- *   49. (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   50. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
- *   51. (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   52. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
- *
- *   53. (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   54. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
- *   55. (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   56. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
- *
- *   57. (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   58. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
- *   59. (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   60. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
- *
- *   61. (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
- *   62. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
- *
- *   63. (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   64. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
- *   65. (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   66. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
- *
- *   67. (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   68. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
- *   69. (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   70. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
- *
- *   71. (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   72. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- *   73. (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   74. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- *
- *   75. (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
- *
- * where "DATA_TYPE" is any type supported by the NumPy module, and
- * "DIM_TYPE" is any int-like type suitable for specifying dimensions.
- * The difference between "ARRAY" typemaps and "FARRAY" typemaps is
- * that the "FARRAY" typemaps expect Fortran ordering of
- * multidimensional arrays.  In python, the dimensions will not need
- * to be specified (except for the "DATA_TYPE* ARGOUT_ARRAY1"
- * typemaps).  The IN_ARRAYs can be a numpy array or any sequence that
- * can be converted to a numpy array of the specified type.  The
- * INPLACE_ARRAYs must be numpy arrays of the appropriate type.  The
- * ARGOUT_ARRAYs will be returned as new numpy arrays of the
- * appropriate type.
- *
- * These typemaps can be applied to existing functions using the
- * %apply directive.  For example:
- *
- *     %apply (double* IN_ARRAY1, int DIM1) {(double* series, int length)};
- *     double prod(double* series, int length);
- *
- *     %apply (int DIM1, int DIM2, double* INPLACE_ARRAY2)
- *           {(int rows, int cols, double* matrix        )};
- *     void floor(int rows, int cols, double* matrix, double f);
- *
- *     %apply (double IN_ARRAY3[ANY][ANY][ANY])
- *           {(double tensor[2][2][2]         )};
- *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
- *           {(double low[2][2][2]                )};
- *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
- *           {(double upp[2][2][2]                )};
- *     void luSplit(double tensor[2][2][2],
- *                  double low[2][2][2],
- *                  double upp[2][2][2]    );
- *
- * or directly with
- *
- *     double prod(double* IN_ARRAY1, int DIM1);
- *
- *     void floor(int DIM1, int DIM2, double* INPLACE_ARRAY2, double f);
- *
- *     void luSplit(double IN_ARRAY3[ANY][ANY][ANY],
- *                  double ARGOUT_ARRAY3[ANY][ANY][ANY],
- *                  double ARGOUT_ARRAY3[ANY][ANY][ANY]);
- */
-
-%define %numpy_typemaps(DATA_TYPE, DATA_TYPECODE, DIM_TYPE)
-
-/************************/
-/* Input Array Typemaps */
-/************************/
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY1[ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY1[ANY])
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY1[ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[1] = { $1_dim0 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 1) ||
-      !require_size(array, size, 1)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY1[ANY])
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[1] = { -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 1) ||
-      !require_size(array, size, 1)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[1] = {-1};
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 1) ||
-      !require_size(array, size, 1)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY2[ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[2] = { $1_dim0, $1_dim1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 2) ||
-      !require_size(array, size, 2)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[2] = { -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 2) ||
-      !require_size(array, size, 2)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[2] = { -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 2) ||
-      !require_size(array, size, 2)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[2] = { -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input,
-                                                DATA_TYPECODE,
-                                                &is_new_object);
-  if (!array || !require_dimensions(array, 2) ||
-      !require_size(array, size, 2) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[2] = { -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 2) ||
-      !require_size(array, size, 2) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  /* for now, only concerned with lists */
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
-{
-  npy_intp size[2] = { -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-  int is_new_object;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-  is_new_object_array = (int *)calloc($2,sizeof(int));
-
-  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_contiguous_allow_conversion(PySequence_GetItem($input,i), DATA_TYPECODE, &is_new_object);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-    is_new_object_array[i] = is_new_object;
-
-    if (!temp_array || !require_dimensions(temp_array, 2)) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-    }
-
-    if (!require_size(temp_array, size, 2)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-}
-%typemap(freearg)
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  Py_ssize_t i;
-
-  if (array$argnum!=NULL) free(array$argnum);
-
-  /*freeing the individual arrays if needed */
-  if (object_array$argnum!=NULL)
-  {
-    if (is_new_object_array$argnum!=NULL)
-    {
-      for (i=0; i<$2; i++)
-      {
-        if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
-        { Py_DECREF(object_array$argnum[i]); }
-      }
-      free(is_new_object_array$argnum);
-    }
-    free(object_array$argnum);
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* IN_ARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3) | !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* IN_FARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { $1_dim0, $1_dim1, $1_dim2 , $1_dim3};
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  /* for now, only concerned with lists */
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-  int is_new_object;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-  is_new_object_array = (int *)calloc($2,sizeof(int));
-
-  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_contiguous_allow_conversion(PySequence_GetItem($input,i), DATA_TYPECODE, &is_new_object);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-    is_new_object_array[i] = is_new_object;
-
-    if (!temp_array || !require_dimensions(temp_array, 3)) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-      size[2] = array_size(temp_array,2);
-    }
-
-    if (!require_size(temp_array, size, 3)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-  $5 = (DIM_TYPE) size[2];
-}
-%typemap(freearg)
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  Py_ssize_t i;
-
-  if (array$argnum!=NULL) free(array$argnum);
-
-  /*freeing the individual arrays if needed */
-  if (object_array$argnum!=NULL)
-  {
-    if (is_new_object_array$argnum!=NULL)
-    {
-      for (i=0; i<$2; i++)
-      {
-        if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
-        { Py_DECREF(object_array$argnum[i]); }
-      }
-      free(is_new_object_array$argnum);
-    }
-    free(object_array$argnum);
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* IN_ARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { -1, -1, -1 , -1};
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4) | !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* IN_FARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { -1, -1, -1 , -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-{
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/***************************/
-/* In-Place Array Typemaps */
-/***************************/
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY1[ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY1[ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY1[ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[1] = { $1_dim0 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,1) || !require_size(array, size, 1) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
-  (PyArrayObject* array=NULL, int i=1)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,1) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = 1;
-  for (i=0; i < array_numdims(array); ++i) $2 *= array_size(array,i);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
-  (PyArrayObject* array=NULL, int i=0)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,1) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = 1;
-  for (i=0; i < array_numdims(array); ++i) $1 *= array_size(array,i);
-  $2 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[2] = { $1_dim0, $1_dim1 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_size(array, size, 2) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_size(array, size, 3) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-
-/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
-{
-  npy_intp size[2] = { -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-
-  if (array == NULL || object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-
-    if ( !temp_array || !require_dimensions(temp_array, 2) ||
-      !require_contiguous(temp_array) ||
-      !require_native(temp_array) ||
-      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
-    ) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-    }
-
-    if (!require_size(temp_array, size, 2)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-}
-%typemap(freearg)
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  if (array$argnum!=NULL) free(array$argnum);
-  if (object_array$argnum!=NULL) free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_ARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_FARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[4] = { $1_dim0, $1_dim1, $1_dim2 , $1_dim3 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_size(array, size, 4) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-
-/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-
-  if (array == NULL || object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-
-    if ( !temp_array || !require_dimensions(temp_array, 3) ||
-      !require_contiguous(temp_array) ||
-      !require_native(temp_array) ||
-      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
-    ) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-      size[2] = array_size(temp_array,2);
-    }
-
-    if (!require_size(temp_array, size, 3)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-  $5 = (DIM_TYPE) size[2];
-}
-%typemap(freearg)
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  if (array$argnum!=NULL) free(array$argnum);
-  if (object_array$argnum!=NULL) free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* INPLACE_ARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_FARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-
-/*************************/
-/* Argout Array Typemaps */
-/*************************/
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY1[ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY1[ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1] = { $1_dim0 };
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY1[ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
- */
-%typemap(in,numinputs=1,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1];
-  if (!PyInt_Check($input))
-  {
-    const char* typestring = pytype_string($input);
-    PyErr_Format(PyExc_TypeError,
-                 "Int dimension expected.  '%s' given.",
-                 typestring);
-    SWIG_fail;
-  }
-  $2 = (DIM_TYPE) PyInt_AsLong($input);
-  dims[0] = (npy_intp) $2;
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
- */
-%typemap(in,numinputs=1,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1];
-  if (!PyInt_Check($input))
-  {
-    const char* typestring = pytype_string($input);
-    PyErr_Format(PyExc_TypeError,
-                 "Int dimension expected.  '%s' given.",
-                 typestring);
-    SWIG_fail;
-  }
-  $1 = (DIM_TYPE) PyInt_AsLong($input);
-  dims[0] = (npy_intp) $1;
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $2 = (DATA_TYPE*) array_data(array);
-}
-%typemap(argout)
-  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[2] = { $1_dim0, $1_dim1 };
-  array = PyArray_SimpleNew(2, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = PyArray_SimpleNew(3, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[4] = { $1_dim0, $1_dim1, $1_dim2, $1_dim3 };
-  array = PyArray_SimpleNew(4, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/*****************************/
-/* Argoutview Array Typemaps */
-/*****************************/
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1    )
-  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
-{
-  npy_intp dims[1] = { *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEW_ARRAY1)
-  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL )
-{
-  $1 = &dim_temp;
-  $2 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
-{
-  npy_intp dims[1] = { *$1 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_ARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_FARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEW_ARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL)
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEW_FARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEW_FARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEW_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEW_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/*************************************/
-/* Managed Argoutview Array Typemaps */
-/*************************************/
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1    )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
-{
-  npy_intp dims[1] = { *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEWM_ARRAY1)
-  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim_temp;
-  $2 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
-{
-  npy_intp dims[1] = { *$1 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_ARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL   , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_FARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_ARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj= PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_FARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/**************************************/
-/* In-Place Array Typemap - flattened */
-/**************************************/
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
-  (PyArrayObject* array=NULL, int i=1)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_c_or_f_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = 1;
-  for (i=0; i < array_numdims(array); ++i) $2 *= array_size(array,i);
-}
-
-%enddef    /* %numpy_typemaps() macro */
-/* *************************************************************** */
-
-/* Concrete instances of the %numpy_typemaps() macro: Each invocation
- * below applies all of the typemaps above to the specified data type.
- */
-%numpy_typemaps(signed char       , NPY_BYTE     , int)
-%numpy_typemaps(unsigned char     , NPY_UBYTE    , int)
-%numpy_typemaps(short             , NPY_SHORT    , int)
-%numpy_typemaps(unsigned short    , NPY_USHORT   , int)
-%numpy_typemaps(int               , NPY_INT      , int)
-%numpy_typemaps(unsigned int      , NPY_UINT     , int)
-%numpy_typemaps(long              , NPY_LONG     , int)
-%numpy_typemaps(unsigned long     , NPY_ULONG    , int)
-%numpy_typemaps(long long         , NPY_LONGLONG , int)
-%numpy_typemaps(unsigned long long, NPY_ULONGLONG, int)
-%numpy_typemaps(float             , NPY_FLOAT    , int)
-%numpy_typemaps(double            , NPY_DOUBLE   , int)
-
-/* ***************************************************************
- * The follow macro expansion does not work, because C++ bool is 4
- * bytes and NPY_BOOL is 1 byte
- *
- *    %numpy_typemaps(bool, NPY_BOOL, int)
- */
-
-/* ***************************************************************
- * On my Mac, I get the following warning for this macro expansion:
- * 'swig/python detected a memory leak of type 'long double *', no destructor found.'
- *
- *    %numpy_typemaps(long double, NPY_LONGDOUBLE, int)
- */
-
-#ifdef __cplusplus
-
-%include <std_complex.i>
-
-%numpy_typemaps(std::complex<float>,  NPY_CFLOAT , int)
-%numpy_typemaps(std::complex<double>, NPY_CDOUBLE, int)
-
-#endif
-
-#endif /* SWIGPYTHON */
diff --git a/paddle/legacy/api/test/.gitignore b/paddle/legacy/api/test/.gitignore
deleted file mode 100644
index b7948824a1e..00000000000
--- a/paddle/legacy/api/test/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.w0
-*.wbias
diff --git a/paddle/legacy/api/test/CMakeLists.txt b/paddle/legacy/api/test/CMakeLists.txt
deleted file mode 100644
index 13cb79129cc..00000000000
--- a/paddle/legacy/api/test/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
-
-py_test(testTrain SRCS testTrain.py)
-py_test(testMatrix SRCS testMatrix.py)
-py_test(testVector SRCS testVector.py)
-py_test(testTrainer SRCS testTrainer.py)
-py_test(testArguments SRCS testArguments.py)
-py_test(testGradientMachine SRCS testGradientMachine.py)
diff --git a/paddle/legacy/api/test/testArguments.py b/paddle/legacy/api/test/testArguments.py
deleted file mode 100644
index 4d40ffec9a0..00000000000
--- a/paddle/legacy/api/test/testArguments.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import numpy as np
-import unittest
-
-
-class TestArguments(unittest.TestCase):
-    def test_load_arguments(self):
-        m = swig_paddle.Matrix.createDense([4, 2, 4, 3, 9, 5], 2, 3)
-        args = swig_paddle.Arguments.createArguments(1)
-        args.setSlotValue(0, m)
-
-        self.assertAlmostEqual(27.0, args.sum())
-
-        mat = args.getSlotValue(0)
-        assert isinstance(mat, swig_paddle.Matrix)
-        np_mat = mat.toNumpyMatInplace()
-        # The matrix unittest is in testMatrix.py
-        self.assertEqual(np_mat.shape, (2, 3))
-
-        args.setSlotIds(0, swig_paddle.IVector.create([1, 2, 3, 4, 5, 6]))
-        iv = args.getSlotIds(0)
-        assert isinstance(iv, swig_paddle.IVector)
-        np_arr = iv.toNumpyArrayInplace()
-        self.assertEqual(np_arr.shape, (6, ))
-
-    def test_arguments_shape(self):
-        h, w = 4, 6
-        v = np.random.rand(2, h * w)
-        m = swig_paddle.Matrix.createDense(v.flatten(), 2, h * w)
-        args = swig_paddle.Arguments.createArguments(1)
-        args.setSlotValue(0, m)
-        args.setSlotFrameHeight(0, h)
-        args.setSlotFrameWidth(0, w)
-        self.assertEqual(args.getSlotFrameHeight(), h)
-        self.assertEqual(args.getSlotFrameWidth(), w)
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0")
-    unittest.main()
diff --git a/paddle/legacy/api/test/testGradientMachine.py b/paddle/legacy/api/test/testGradientMachine.py
deleted file mode 100644
index 4b705f66ecc..00000000000
--- a/paddle/legacy/api/test/testGradientMachine.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import paddle.proto.ParameterConfig_pb2
-import util
-import unittest
-import numpy
-
-
-class TestGradientMachine(unittest.TestCase):
-    def test_create_gradient_machine(self):
-        conf_file_path = "./testTrainConfig.py"
-        trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
-            conf_file_path)
-        self.assertIsNotNone(trainer_config)
-        opt_config = trainer_config.getOptimizationConfig()
-        model_config = trainer_config.getModelConfig()
-        self.assertIsNotNone(model_config)
-        machine = swig_paddle.GradientMachine.createByModelConfig(
-            model_config, swig_paddle.CREATE_MODE_NORMAL,
-            swig_paddle.ParameterOptimizer.create(opt_config).getParameterTypes(
-            ))
-        self.assertIsNotNone(machine)
-        ipt, _ = util.loadMNISTTrainData()
-        output = swig_paddle.Arguments.createArguments(0)
-
-        optimizers = {}
-
-        # Initial Machine Parameter all to 0.1
-        for param in machine.getParameters():
-            assert isinstance(param, swig_paddle.Parameter)
-            val = param.getBuf(swig_paddle.PARAMETER_VALUE)
-            assert isinstance(val, swig_paddle.Vector)
-            arr = numpy.full((len(val), ), 0.1, dtype="float32")
-            val.copyFromNumpyArray(arr)
-            self.assertTrue(param.save(param.getName()))
-            param_config = param.getConfig().toProto()
-            assert isinstance(param_config,
-                              paddle.proto.ParameterConfig_pb2.ParameterConfig)
-            opt = swig_paddle.ParameterOptimizer.create(opt_config)
-            optimizers[param.getID()] = opt
-            num_rows = param_config.dims[1]
-            opt.init(num_rows, param.getConfig())
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.startPass()
-
-        batch_size = ipt.getSlotValue(0).getHeight()
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.startBatch(batch_size)
-
-        machine.forward(ipt, output, swig_paddle.PASS_TRAIN)
-        self.assertEqual(1, output.getSlotNum())
-        self.isCalled = False
-
-        def backward_callback(param_):
-            self.isCalled = isinstance(param_, swig_paddle.Parameter)
-            assert isinstance(param_, swig_paddle.Parameter)
-            vec = param_.getBuf(swig_paddle.PARAMETER_VALUE)
-            assert isinstance(vec, swig_paddle.Vector)
-            vec = vec.copyToNumpyArray()
-            for val_ in vec:
-                self.assertTrue(
-                    util.doubleEqual(val_, 0.1))  # Assert All Value is 0.1
-
-            vecs = list(param_.getBufs())
-            opt_ = optimizers[param_.getID()]
-            opt_.update(vecs, param_.getConfig())
-
-        machine.backward(backward_callback)
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.finishBatch()
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.finishPass()
-
-        self.assertTrue(self.isCalled)
-
-        for param in machine.getParameters():
-            self.assertTrue(param.load(param.getName()))
-
-    def test_train_one_pass(self):
-        conf_file_path = './testTrainConfig.py'
-        trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
-            conf_file_path)
-        model_config = trainer_config.getModelConfig()
-        machine = swig_paddle.GradientMachine.createByModelConfig(model_config)
-
-        at_end = False
-
-        output = swig_paddle.Arguments.createArguments(0)
-        if not at_end:
-            input_, at_end = util.loadMNISTTrainData(1000)
-            machine.forwardBackward(input_, output, swig_paddle.PASS_TRAIN)
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle('--use_gpu=0')
-    unittest.main()
diff --git a/paddle/legacy/api/test/testMatrix.py b/paddle/legacy/api/test/testMatrix.py
deleted file mode 100644
index f08fbf3ccdf..00000000000
--- a/paddle/legacy/api/test/testMatrix.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import numpy as np
-import unittest
-
-
-class TestMatrix(unittest.TestCase):
-    def test_createZero_get_set(self):
-        m = swig_paddle.Matrix.createZero(32, 24)
-        self.assertEqual(m.getWidth(), 24)
-        self.assertEqual(m.getHeight(), 32)
-        for x in xrange(24):
-            for y in xrange(32):
-                self.assertEqual(0.0, m.get(x, y))
-        with self.assertRaises(swig_paddle.RangeError):
-            m.get(51, 47)
-        m.set(3, 3, 3.0)
-        self.assertEqual(m.get(3, 3), 3.0)
-
-    def test_sparse(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, True, False, False)
-        self.assertIsNotNone(m)
-        self.assertTrue(m.isSparse())
-        self.assertEqual(m.getSparseValueType(), swig_paddle.SPARSE_NON_VALUE)
-        self.assertEqual(m.getSparseFormat(), swig_paddle.SPARSE_CSR)
-        m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [])
-        self.assertEqual(m.getSparseRowCols(0), [0, 1])
-        self.assertEqual(m.getSparseRowCols(1), [2])
-        self.assertEqual(m.getSparseRowCols(2), [])
-
-    def test_sparse_value(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, False, False, False)
-        self.assertIsNotNone(m)
-        m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [7.3, 4.2, 3.2])
-
-        def assertKVArraySame(actual, expect):
-            self.assertEqual(len(actual), len(expect))
-            for i in xrange(len(actual)):
-                a = actual[i]
-                e = expect[i]
-                self.assertIsInstance(a, tuple)
-                self.assertIsInstance(e, tuple)
-                self.assertEqual(len(a), 2)
-                self.assertEqual(len(e), 2)
-                self.assertEqual(a[0], e[0])
-                self.assertTrue(abs(a[1] - e[1]) < 1e-5)
-
-        first_row = m.getSparseRowColsVal(0)
-        assertKVArraySame(first_row, [(0, 7.3), (1, 4.2)])
-
-    def test_createDenseMat(self):
-        m = swig_paddle.Matrix.createDense([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 2, 3)
-        self.assertIsNotNone(m)
-        self.assertTrue(abs(m.get(1, 1) - 0.5) < 1e-5)
-
-    def test_numpyCpu(self):
-        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
-                         numpy_mat.shape)
-
-        # the numpy matrix and paddle matrix shared the same memory.
-        numpy_mat[0, 1] = 342.23
-
-        for h in xrange(m.getHeight()):
-            for w in xrange(m.getWidth()):
-                self.assertEqual(m.get(h, w), numpy_mat[h, w])
-
-        mat2 = m.toNumpyMatInplace()
-        mat2[1, 1] = 32.2
-        self.assertTrue(np.array_equal(mat2, numpy_mat))
-
-    def test_numpyGpu(self):
-        if swig_paddle.isGpuVersion():
-            numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype='float32')
-            gpu_m = swig_paddle.Matrix.createGpuDenseFromNumpy(numpy_mat)
-            assert isinstance(gpu_m, swig_paddle.Matrix)
-            self.assertEqual((int(gpu_m.getHeight()), int(gpu_m.getWidth())),
-                             numpy_mat.shape)
-            self.assertTrue(gpu_m.isGpu())
-            numpy_mat = gpu_m.copyToNumpyMat()
-            numpy_mat[0, 1] = 3.23
-            for a, e in zip(gpu_m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
-                self.assertAlmostEqual(a, e)
-
-            gpu_m.copyFromNumpyMat(numpy_mat)
-
-            for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
-                self.assertAlmostEqual(a, e)
-
-    def test_numpy(self):
-        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
-                         numpy_mat.shape)
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
-            self.assertAlmostEqual(a, e)
-
-
-if __name__ == "__main__":
-    swig_paddle.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestMatrix)
-    unittest.TextTestRunner().run(suite)
-    if swig_paddle.isGpuVersion():
-        swig_paddle.setUseGpu(True)
-        unittest.main()
diff --git a/paddle/legacy/api/test/testTrain.py b/paddle/legacy/api/test/testTrain.py
deleted file mode 100644
index 7061a4c43bf..00000000000
--- a/paddle/legacy/api/test/testTrain.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import paddle.trainer.config_parser
-import numpy
-import util
-
-
-def init_params(params):
-    def init_param(p):
-        assert isinstance(p, swig_paddle.Parameter)
-        val = p.getBuf(swig_paddle.PARAMETER_VALUE)
-        assert isinstance(val, swig_paddle.Vector)
-        arr = val.toNumpyArrayInplace()
-        for i in xrange(len(arr)):
-            arr[i] = numpy.random.uniform(-1.0, 1.0)
-
-    for p in params:
-        init_param(p)
-
-
-def init_optimizers(opt_conf, params):
-    opts = {}
-    for param in params:
-        param_conf = param.getConfig().toProto()
-        opts[param.getID()] = swig_paddle.ParameterOptimizer.create(opt_conf)
-        opts[param.getID()].init(param_conf.dims[1], param.getConfig())
-    retv_opts = [None for _ in xrange(len(opts))]
-    for k in opts:
-        assert k < len(retv_opts)
-        retv_opts[k] = opts[k]
-    return retv_opts
-
-
-def main():
-    trainer_config = paddle.trainer.config_parser.parse_config(
-        "./testTrainConfig.py", "")
-    opt_config = trainer_config.opt_config
-    print "========Optimization Config ======="
-    print opt_config
-    print "==================================="
-    opt_config = swig_paddle.OptimizationConfig.createFromProto(opt_config)
-    _temp_optimizer_ = swig_paddle.ParameterOptimizer.create(opt_config)
-    enable_types = _temp_optimizer_.getParameterTypes()
-    m = swig_paddle.GradientMachine.createFromConfigProto(
-        trainer_config.model_config, swig_paddle.CREATE_MODE_NORMAL,
-        enable_types)
-    assert m is not None
-    assert isinstance(m, swig_paddle.GradientMachine)
-    init_params(m.getParameters())
-
-    optimizers = init_optimizers(opt_config, m.getParameters())
-
-    # Train One Pass.
-    for optimizer in optimizers:
-        optimizer.startPass()
-    batch_id = 0
-    while True:  # Train one batch
-        batch_size = 1000
-        inArgs, atEnd = util.loadMNISTTrainData(batch_size)
-        if atEnd:
-            break
-        outArgs = swig_paddle.Arguments.createArguments(0)
-
-        for optimizer in optimizers:
-            optimizer.startBatch(batch_size)
-
-        def update_callback(param):
-            try:
-                bufs = list(param.getBufs())
-                opt = optimizers[param.getID()]
-                opt.update(bufs, param.getConfig())
-                callback = opt.needSpecialTraversal(param.getConfig())
-                if callback is not None:
-                    callback(bufs, param.getConfig(), swig_paddle.NO_SPARSE_ID)
-
-            except Exception as e:
-                print e
-
-        ev = m.makeEvaluator()
-        ev.start()
-        m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
-                          update_callback)
-        m.eval(ev)
-        ev.finish()
-        for name in ev.getNames():
-            print name, ev.getValue(name)
-        for optimizer in optimizers:
-            optimizer.finishBatch()
-
-        cost_vec = outArgs.getSlotValue(0)
-        assert isinstance(cost_vec, swig_paddle.Matrix)
-        cost_vec = cost_vec.copyToNumpyMat()
-        print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum(
-        ) / batch_size
-        batch_id += 1
-
-    for optimizer in optimizers:
-        optimizer.finishPass()
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
-    main()
diff --git a/paddle/legacy/api/test/testTrainConfig.py b/paddle/legacy/api/test/testTrainConfig.py
deleted file mode 100644
index c02d61ebad5..00000000000
--- a/paddle/legacy/api/test/testTrainConfig.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_method=AdamOptimizer())
-
-din = data_layer(name='input', size=784)
-
-fc1 = fc_layer(name='hidden1', input=din, size=100)
-fc2 = fc_layer(name='hidden2', input=fc1, size=100)
-
-opt = fc_layer(input=fc2, size=10, act=SoftmaxActivation())
-outputs(classification_cost(input=opt, label=data_layer('lbl', 10)))
diff --git a/paddle/legacy/api/test/testTrainer.py b/paddle/legacy/api/test/testTrainer.py
deleted file mode 100644
index a76cbf02d83..00000000000
--- a/paddle/legacy/api/test/testTrainer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-from py_paddle import swig_paddle
-import util
-
-
-def main():
-    trainer_config = parse_config("./testTrainConfig.py", "")
-    model = swig_paddle.GradientMachine.createFromConfigProto(
-        trainer_config.model_config)
-    trainer = swig_paddle.Trainer.create(trainer_config, model)
-    trainer.startTrain()
-    for train_pass in xrange(2):
-        trainer.startTrainPass()
-        num = 0
-        cost = 0
-        while True:  # Train one batch
-            batch_size = 1000
-            data, atEnd = util.loadMNISTTrainData(batch_size)
-            if atEnd:
-                break
-            trainer.trainOneDataBatch(batch_size, data)
-            outs = trainer.getForwardOutput()
-            cost += sum(outs[0]['value'])
-            num += batch_size
-        trainer.finishTrainPass()
-        logger.info('train cost=%f' % (cost / num))
-
-        trainer.startTestPeriod()
-        num = 0
-        cost = 0
-        while True:  # Test one batch
-            batch_size = 1000
-            data, atEnd = util.loadMNISTTrainData(batch_size)
-            if atEnd:
-                break
-            trainer.testOneDataBatch(batch_size, data)
-            outs = trainer.getForwardOutput()
-            cost += sum(outs[0]['value'])
-            num += batch_size
-        trainer.finishTestPeriod()
-        logger.info('test cost=%f' % (cost / num))
-
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
-    main()
diff --git a/paddle/legacy/api/test/testVector.py b/paddle/legacy/api/test/testVector.py
deleted file mode 100644
index 6339cf85426..00000000000
--- a/paddle/legacy/api/test/testVector.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import util
-import numpy as np
-import unittest
-
-
-class TestIVector(unittest.TestCase):
-    def test_createZero(self):
-        m = swig_paddle.IVector.createZero(10, False)
-        self.assertIsNotNone(m)
-        for i in xrange(10):
-            self.assertEqual(m[i], 0)
-            m[i] = i
-            self.assertEqual(m[i], i)
-
-        m = swig_paddle.IVector.createZero(10)
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), [0] * 10)
-
-    def test_create(self):
-        m = swig_paddle.IVector.create(range(10), False)
-        self.assertIsNotNone(m)
-        for i in xrange(10):
-            self.assertEqual(m[i], i)
-
-        m = swig_paddle.IVector.create(range(10))
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), range(10))
-
-    def test_cpu_numpy(self):
-        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
-        self.assertEqual(vec.shape[0], int(iv.__len__()))
-        vec[4] = 832
-        for i in xrange(len(iv)):
-            self.assertEqual(vec[i], iv[i])
-        vec2 = iv.toNumpyArrayInplace()
-        vec2[1] = 384
-        for i in xrange(len(iv)):
-            self.assertEqual(vec[i], iv[i])
-            self.assertEqual(vec2[i], iv[i])
-
-    def test_gpu_numpy(self):
-        if swig_paddle.isGpuVersion():
-            vec = swig_paddle.IVector.create(range(0, 10), True)
-            assert isinstance(vec, swig_paddle.IVector)
-            self.assertTrue(vec.isGpu())
-            self.assertEqual(vec.getData(), range(0, 10))
-            num_arr = vec.copyToNumpyArray()
-            assert isinstance(num_arr, np.ndarray)  # for code hint.
-            num_arr[4] = 7
-            self.assertEquals(vec.getData(), range(0, 10))
-
-            vec.copyFromNumpyArray(num_arr)
-            expect_vec = range(0, 10)
-            expect_vec[4] = 7
-            self.assertEqual(vec.getData(), expect_vec)
-
-    def test_numpy(self):
-        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createVectorFromNumpy(vec)
-        self.assertEqual(iv.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(iv.getData(), list(vec))
-
-
-class TestVector(unittest.TestCase):
-    def testCreateZero(self):
-        v = swig_paddle.Vector.createZero(10, False)
-        self.assertIsNotNone(v)
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(v[i], 0))
-            v[i] = i
-            self.assertTrue(util.doubleEqual(v[i], i))
-
-        v = swig_paddle.Vector.createZero(10)
-        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(v.getData(), [0] * 10)
-
-    def testCreate(self):
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
-        self.assertIsNotNone(v)
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(v[i], i / 100.0))
-        self.assertEqual(100, len(v))
-
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
-        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(100, len(v))
-        vdata = v.getData()
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
-
-    def testCpuNumpy(self):
-        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
-        assert isinstance(vec, swig_paddle.Vector)
-        numpy_arr[0] = 0.1
-        for n, v in zip(numpy_arr, vec):
-            self.assertTrue(util.doubleEqual(n, v))
-
-        numpy_2 = vec.toNumpyArrayInplace()
-        vec[0] = 1.3
-        for x, y in zip(numpy_arr, numpy_2):
-            self.assertTrue(util.doubleEqual(x, y))
-
-        for x, y in zip(numpy_arr, vec):
-            self.assertTrue(util.doubleEqual(x, y))
-
-        numpy_3 = vec.copyToNumpyArray()
-        numpy_3[0] = 0.4
-        self.assertTrue(util.doubleEqual(vec[0], 1.3))
-        self.assertTrue(util.doubleEqual(numpy_3[0], 0.4))
-
-        for i in xrange(1, len(numpy_3)):
-            util.doubleEqual(numpy_3[i], vec[i])
-
-    def testNumpy(self):
-        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
-        self.assertEqual(vec.isGpu(), swig_paddle.isUsingGpu())
-        vecData = vec.getData()
-        for n, v in zip(numpy_arr, vecData):
-            self.assertTrue(util.doubleEqual(n, v))
-
-    def testCopyFromNumpy(self):
-        vec = swig_paddle.Vector.createZero(1, False)
-        arr = np.array([1.3, 3.2, 2.4], dtype="float32")
-        vec.copyFromNumpyArray(arr)
-        for i in xrange(len(vec)):
-            self.assertTrue(util.doubleEqual(vec[i], arr[i]))
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestVector)
-    unittest.TextTestRunner().run(suite)
-    if swig_paddle.isGpuVersion():
-        swig_paddle.setUseGpu(True)
-        unittest.main()
diff --git a/paddle/legacy/api/test/util.py b/paddle/legacy/api/test/util.py
deleted file mode 100644
index 9f4631c53e1..00000000000
--- a/paddle/legacy/api/test/util.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-
-import numpy as np
-from py_paddle import swig_paddle
-
-
-def doubleEqual(a, b):
-    return abs(a - b) < 1e-5
-
-
-def __readFromFile():
-    for i in xrange(10002):
-        label = np.random.randint(0, 9)
-        sample = np.random.rand(784) + 0.1 * label
-        yield sample, label
-
-
-def loadMNISTTrainData(batch_size=100):
-    if not hasattr(loadMNISTTrainData, "gen"):
-        generator = __readFromFile()
-        loadMNISTTrainData.gen = generator
-    else:
-        generator = loadMNISTTrainData.gen
-    args = swig_paddle.Arguments.createArguments(2)
-    # batch_size = 100
-
-    dense_slot = []
-    id_slot = []
-    atEnd = False
-
-    for _ in xrange(batch_size):
-        try:
-            result = generator.next()
-            dense_slot.extend(result[0])
-            id_slot.append(result[1])
-        except StopIteration:
-            atEnd = True
-            del loadMNISTTrainData.gen
-            break
-
-    dense_slot = swig_paddle.Matrix.createDense(dense_slot, batch_size, 784)
-    id_slot = swig_paddle.IVector.create(id_slot)
-    args.setSlotValue(0, dense_slot)
-    args.setSlotIds(1, id_slot)
-    return args, atEnd
diff --git a/paddle/legacy/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp
deleted file mode 100644
index 0ce1770c76c..00000000000
--- a/paddle/legacy/capi/Arguments.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "arguments.h"
-#include "capi_private.h"
-
-using paddle::capi::cast;
-
-#define castArg(v) cast<paddle::capi::CArguments>(v)
-#define castIVec(v) cast<paddle::capi::CIVector>(v)
-
-extern "C" {
-paddle_arguments paddle_arguments_create_none() {
-  return new paddle::capi::CArguments();
-}
-
-paddle_error paddle_arguments_destroy(paddle_arguments args) {
-  if (args == nullptr) return kPD_NULLPTR;
-  delete castArg(args);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_size(paddle_arguments args, uint64_t* size) {
-  if (args == nullptr || size == nullptr) return kPD_NULLPTR;
-  *size = castArg(args)->args.size();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_resize(paddle_arguments args, uint64_t size) {
-  if (args == nullptr) return kPD_NULLPTR;
-  castArg(args)->args.resize(size);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_value(paddle_arguments args,
-                                        uint64_t ID,
-                                        paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  if (m->mat == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].value = m->mat;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_value(paddle_arguments args,
-                                        uint64_t ID,
-                                        paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  m->mat = a->args[ID].value;
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
-                                              uint64_t ID,
-                                              paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  m->mat = a->args[ID].in;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_ids(paddle_arguments args,
-                                      uint64_t ID,
-                                      paddle_ivector ids) {
-  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
-  auto iv = castIVec(ids);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  iv->vec = a->args[ID].ids;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_ids(paddle_arguments args,
-                                      uint64_t ID,
-                                      paddle_ivector ids) {
-  //! TODO(lizhao): Complete this method.
-  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(ids);
-  if (iv->vec == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].ids = iv->vec;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
-                                              uint64_t ID,
-                                              uint64_t frameHeight,
-                                              uint64_t frameWidth) {
-  if (args == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].setFrameHeight(frameHeight);
-  a->args[ID].setFrameWidth(frameWidth);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint32_t nestedLevel,
-                                                     paddle_ivector seqPos) {
-  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
-  if (iv->vec == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
-    ptr = std::make_shared<paddle::ICpuGpuVector>(iv->vec);
-  });
-}
-
-paddle_error paddle_arguments_get_sequence_start_pos(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint32_t nestedLevel,
-                                                     paddle_ivector seqPos) {
-  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
-  auto a = castArg(args);
-  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
-    iv->vec = ptr->getMutableVector(false);
-  });
-}
-}
diff --git a/paddle/legacy/capi/CMakeLists.txt b/paddle/legacy/capi/CMakeLists.txt
deleted file mode 100644
index 957b1a3e6b0..00000000000
--- a/paddle/legacy/capi/CMakeLists.txt
+++ /dev/null
@@ -1,118 +0,0 @@
-if (WITH_DOUBLE)
-  set(PADDLE_FLOAT_TYPE double)
-else ()
-  set(PADDLE_FLOAT_TYPE float)
-endif()
-
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
-  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT PADDLE_GIT_COMMIT)
-  set(PADDLE_GIT_COMMIT "no commit information")
-endif()
-
-# config.h used for C-API. It will store Paddle building configuration as a
-# header. Make user just include PaddleCAPI.h then can get building
-# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
-# libraries.
-configure_file(config.h.in config.h @ONLY)
-
-# PaddleCAPI.h is the only header we exposed. It currently only used for model
-# inference.
-file(GLOB CAPI_HEADERS *.h)
-set(CAPI_PRIVATE_HEADER capi_private.h)
-list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
-file(GLOB CAPI_SOURCES *.cpp)
-
-# building paddle_capi
-add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
-  ${CAPI_SOURCES})
-
-target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-
-add_dependencies(paddle_capi paddle_proto paddle_gserver)
-
-# TODO: paddle_capi_whole will be removed.
-set(PADDLE_CAPI_LAYERS_LIBS
-    paddle_function
-    paddle_gserver)
-if(MOBILE_INFERENCE)
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto)
-else()
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto
-      paddle_pserver
-      paddle_network)
-endif()
-set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
-
-# Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
-
-# Link the shared library for inference
-if(NOT IOS)
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
-  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
-  set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-  link_paddle_exe(paddle_capi_shared)
-endif()
-
-# install library & headers.
-install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
-install(FILES paddle_capi.map DESTINATION include/paddle)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
-if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
-          ARCHIVE DESTINATION lib/${ANDROID_ABI}
-          LIBRARY DESTINATION lib/${ANDROID_ABI})
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_COMMITS_LIST
-    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(${GIT_COMMITS_LIST_RESULT})
-    set(GIT_COMMITS_LIST "No commits.")
-  endif()
-  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
-          \"Compiler:\n\"
-          \"\\t${CMAKE_C_COMPILER}\\n\"
-          \"\\t${CMAKE_CXX_COMPILER}\\n\"
-          \"Compiler Flags:\\n\"
-          \"\\t${CMAKE_F_FLAGS}\\n\"
-          \"\\t${CMAKE_CXX_FLAGS}\\n\"
-          \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
-          \"Lastest commit:\\n\"
-          \"\\t${GIT_COMMITS_LIST}\\n\"
-      )"
-  )
-else(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
-  if(NOT IOS)
-    install(TARGETS paddle_capi_shared DESTINATION lib)
-  endif()
-endif(ANDROID)
-
-# this variable used for unittest
-set(PADDLE_CAPI_INC_PATH
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (WITH_TESTING)
-  add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/capi/Main.cpp b/paddle/legacy/capi/Main.cpp
deleted file mode 100644
index 17d8f00a88a..00000000000
--- a/paddle/legacy/capi/Main.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-#include "capi_private.h"
-#include "main.h"
-#include "paddle/legacy/trainer/TrainerConfigHelper.h"
-#include "paddle/legacy/utils/Excepts.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-static void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-}
-
-extern "C" {
-paddle_error paddle_init(int argc, char** argv) {
-  static bool isInit = false;
-  if (isInit) return kPD_NO_ERROR;
-
-  std::vector<char*> realArgv;
-  realArgv.reserve(argc + 1);
-  realArgv.push_back(strdup(""));
-  for (int i = 0; i < argc; ++i) {
-    realArgv.push_back(argv[i]);
-  }
-  initPaddle(argc + 1, realArgv.data());
-  free(realArgv[0]);
-  isInit = true;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_init_thread() {
-  if (FLAGS_use_gpu) {
-    hl_init(FLAGS_gpu_id);
-  }
-  return kPD_NO_ERROR;
-}
-}
diff --git a/paddle/legacy/capi/Matrix.cpp b/paddle/legacy/capi/Matrix.cpp
deleted file mode 100644
index 733d49cacfd..00000000000
--- a/paddle/legacy/capi/Matrix.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi_private.h"
-#include "hl_cuda.h"
-#include "matrix.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
-extern "C" {
-paddle_matrix paddle_matrix_create(uint64_t height,
-                                   uint64_t width,
-                                   bool useGpu) {
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
-  return ptr;
-}
-
-paddle_matrix paddle_matrix_create_none() {
-  return new paddle::capi::CMatrix();
-}
-
-paddle_error paddle_matrix_destroy(paddle_matrix mat) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  delete ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_set_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real* rowArray) {
-  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  paddle::real* buf = ptr->mat->getRowBuf(rowID);
-  size_t width = ptr->mat->getWidth();
-#ifdef PADDLE_WITH_CUDA
-  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
-#else
-  std::copy(rowArray, rowArray + width, buf);
-#endif
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                            paddle_real* value) {
-  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(value, value + width * height, buf);
-  }
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                            paddle_real* result) {
-  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(buf, buf + width * height, result);
-  }
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real** rawRowBuffer) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  *rawRowBuffer = ptr->mat->getRowBuf(rowID);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_shape(paddle_matrix mat,
-                                     uint64_t* height,
-                                     uint64_t* width) {
-  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
-  if (height != nullptr) {
-    *height = cast(mat)->mat->getHeight();
-  }
-  if (width != nullptr) {
-    *width = cast(mat)->mat->getWidth();
-  }
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_matrix paddle_matrix_create_sparse(
-    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      paddle::SPARSE_CSR,
-      false,
-      useGpu);
-  return ptr;
-#else
-  return nullptr;
-#endif
-}
-
-paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
-                                            int* rowArray,
-                                            uint64_t rowSize,
-                                            int* colArray,
-                                            uint64_t colSize,
-                                            float* valueArray,
-                                            uint64_t valueSize) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (rowArray == nullptr || colArray == nullptr ||
-      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
-    return kPD_NULLPTR;
-  }
-  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
-    std::vector<int> row(rowSize);
-    row.assign(rowArray, rowArray + rowSize);
-    std::vector<int> col(colSize);
-    col.assign(colArray, colArray + colSize);
-    std::vector<paddle_real> val(valueSize);
-    if (valueSize) {
-      val.assign(valueArray, valueArray + valueSize);
-    }
-    sparseMat->copyFrom(row, col, val);
-    return kPD_NO_ERROR;
-  } else {
-    return kPD_NOT_SUPPORTED;
-  }
-#else
-  return kPD_NOT_SUPPORTED;
-#endif
-}
diff --git a/paddle/legacy/capi/Vector.cpp b/paddle/legacy/capi/Vector.cpp
deleted file mode 100644
index afb5a9afefe..00000000000
--- a/paddle/legacy/capi/Vector.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi_private.h"
-#include "vector.h"
-
-using paddle::capi::cast;
-
-extern "C" {
-
-paddle_ivector paddle_ivector_create_none() {
-  return new paddle::capi::CIVector();
-}
-
-paddle_ivector paddle_ivector_create(int* array,
-                                     uint64_t size,
-                                     bool copy,
-                                     bool useGPU) {
-  auto ptr = new paddle::capi::CIVector();
-  if (copy) {
-    ptr->vec = paddle::IVector::create(size, useGPU);
-    ptr->vec->copyFrom(array, size);
-  } else {
-    ptr->vec = paddle::IVector::create(array, size, useGPU);
-  }
-  return ptr;
-}
-
-paddle_error paddle_ivector_destroy(paddle_ivector ivec) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  delete cast<paddle::capi::CIVector>(ivec);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer) {
-  if (ivec == nullptr || buffer == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  *buffer = v->vec->getData();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  v->vec->resize(size);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_get_size(paddle_ivector ivec, uint64_t* size) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  *size = v->vec->getSize();
-  return kPD_NO_ERROR;
-}
-}
diff --git a/paddle/legacy/capi/arguments.h b/paddle/legacy/capi/arguments.h
deleted file mode 100644
index ceb64ee6aa7..00000000000
--- a/paddle/legacy/capi/arguments.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_ARGUMENTS_H__
-#define __PADDLE_CAPI_ARGUMENTS_H__
-
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-#include "matrix.h"
-#include "vector.h"
-
-/**
- * Arguments functions. Each argument means layer output. Arguments means a
- * array of arguemnt.
- */
-typedef void* paddle_arguments;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * @brief paddle_arguments_create_none Create a array of arguments, which size
- * is zero.
- * @return Arguemnts
- */
-PD_API paddle_arguments paddle_arguments_create_none();
-
-/**
- * @brief paddle_arguments_destroy Destroy the arguments
- * @param args arguments to destroy
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_destroy(paddle_arguments args);
-
-/**
- * @brief paddle_arguments_get_size Get size of arguments array
- * @param [in] args arguments array
- * @param [out] size array size
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_size(paddle_arguments args,
-                                              uint64_t* size);
-
-/**
- * @brief PDArgsResize Resize a arguments array.
- * @param args arguments array.
- * @param size target size of array
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_resize(paddle_arguments args,
-                                            uint64_t size);
-
-/**
- * @brief PDArgsSetValue Set value matrix of one argument in array, which index
- *        is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_value(paddle_arguments args,
-                                               uint64_t ID,
-                                               paddle_matrix mat);
-
-/**
- * @brief PDArgsGetValue Get value matrix of one argument in array, which index
- *        is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
-                                               uint64_t ID,
-                                               paddle_matrix mat);
-
-/**
- * @brief paddle_arguments_get_prob Get the prob matrix of beam search, which
- *        slot ID is `ID`
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
-                                              uint64_t ID,
-                                              paddle_matrix mat);
-
-/**
- * @brief PDArgsGetIds Get the integer vector of one argument in array, which
- *        index is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param ids integer vector pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_ids(paddle_arguments args,
-                                             uint64_t ID,
-                                             paddle_ivector ids);
-
-/**
- * @brief PDArgsSetIds Set the integer vector of one argument in array, which
- *        index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] ids integer vector pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
-                                             uint64_t ID,
-                                             paddle_ivector ids);
-
-/**
- * @brief paddle_arguments_set_frame_shape Set the fram size of one argument
- *        in array, which index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [in] frameHeight maximum height of input images
- * @param [in] frameWidth maximum width of input images
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint64_t frameHeight,
-                                                     uint64_t frameWidth);
-
-/**
- * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
- *        argument in array, which index is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param seqPos sequence position array.
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_arguments_set_sequence_start_pos(paddle_arguments args,
-                                        uint64_t ID,
-                                        uint32_t nestedLevel,
-                                        paddle_ivector seqPos);
-/**
- * @brief PDArgsGetSequenceStartPos Get sequence start position vector of one
- *        argument in array, which index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] seqPos sequence position array
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_arguments_get_sequence_start_pos(paddle_arguments args,
-                                        uint64_t ID,
-                                        uint32_t nestedLevel,
-                                        paddle_ivector seqPos);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/capi.h b/paddle/legacy/capi/capi.h
deleted file mode 100644
index 749fcc4b799..00000000000
--- a/paddle/legacy/capi/capi.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_H__
-#define __PADDLE_CAPI_H__
-
-/**
- * Paddle C API. It will replace SWIG as Multiple Language API for model
- * training & inference. Currently it is only used in model infernece.
- *
- * NOTE: This is an experimental API, it could be changed.
- */
-#include "arguments.h"
-#include "config.h"
-#include "error.h"
-#include "gradient_machine.h"
-#include "main.h"
-#include "matrix.h"
-#include "vector.h"
-
-#endif  // PADDLECAPI_H_
diff --git a/paddle/legacy/capi/capi_private.h b/paddle/legacy/capi/capi_private.h
deleted file mode 100644
index e5f8c8c5c8b..00000000000
--- a/paddle/legacy/capi/capi_private.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Argument.h"
-#pragma once
-
-namespace paddle {
-namespace capi {
-
-enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
-
-#define STRUCT_HEADER CType type;
-
-struct CHeader {
-  STRUCT_HEADER
-};
-
-struct CIVector {
-  STRUCT_HEADER
-  IVectorPtr vec;
-
-  CIVector() : type(kIVECTOR) {}
-};
-
-struct CMatrix {
-  STRUCT_HEADER
-  MatrixPtr mat;
-
-  CMatrix() : type(kMATRIX) {}
-};
-
-struct CArguments {
-  STRUCT_HEADER
-  std::vector<paddle::Argument> args;
-
-  CArguments() : type(kARGUMENTS) {}
-
-  template <typename T>
-  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
-    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
-    switch (nestedLevel) {
-      case 0:
-        callback(args[ID].sequenceStartPositions);
-        break;
-      case 1:
-        callback(args[ID].subSequenceStartPositions);
-        break;
-      default:
-        return kPD_OUT_OF_RANGE;
-    }
-    return kPD_NO_ERROR;
-  }
-};
-
-struct CGradientMachine {
-  STRUCT_HEADER
-  paddle::GradientMachinePtr machine;
-
-  CGradientMachine() : type(kGRADIENT_MACHINE) {}
-};
-
-template <typename T>
-inline T* cast(void* ptr) {
-  return reinterpret_cast<T*>(ptr);
-}
-}  // namespace capi
-}  // namespace paddle
diff --git a/paddle/legacy/capi/config.h.in b/paddle/legacy/capi/config.h.in
deleted file mode 100644
index 0ddbd8c753c..00000000000
--- a/paddle/legacy/capi/config.h.in
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
-#define __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
-
-typedef @PADDLE_FLOAT_TYPE@ paddle_real;
-
-#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
-#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
-
-// Since we only support linux and macos in compile, always use clang or
-// gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
-#define PD_API __attribute__((visibility("default")))
-
-#endif
diff --git a/paddle/legacy/capi/error.cpp b/paddle/legacy/capi/error.cpp
deleted file mode 100644
index 0c25de5ba98..00000000000
--- a/paddle/legacy/capi/error.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "error.h"
-
-extern "C" const char* paddle_error_string(paddle_error err) {
-  switch (err) {
-    case kPD_NULLPTR:
-      return "nullptr error";
-    case kPD_OUT_OF_RANGE:
-      return "out of range error";
-    case kPD_PROTOBUF_ERROR:
-      return "protobuf error";
-    case kPD_NOT_SUPPORTED:
-      return "not supported error";
-    case kPD_UNDEFINED_ERROR:
-      return "undefined error";
-    default:
-      return "";
-  }
-}
diff --git a/paddle/legacy/capi/error.h b/paddle/legacy/capi/error.h
deleted file mode 100644
index b0940725b50..00000000000
--- a/paddle/legacy/capi/error.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_ERROR_H__
-#define __PADDLE_CAPI_ERROR_H__
-
-#include "config.h"
-
-/**
- * Error Type for Paddle API.
- */
-typedef enum {
-  kPD_NO_ERROR = 0,
-  kPD_NULLPTR = 1,
-  kPD_OUT_OF_RANGE = 2,
-  kPD_PROTOBUF_ERROR = 3,
-  kPD_NOT_SUPPORTED = 4,
-  kPD_UNDEFINED_ERROR = -1,
-} paddle_error;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Error string for Paddle API.
- */
-PD_API const char* paddle_error_string(paddle_error err);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/examples/.gitignore b/paddle/legacy/capi/examples/.gitignore
deleted file mode 100644
index 2caa0a5a298..00000000000
--- a/paddle/legacy/capi/examples/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.bin
-build-*
diff --git a/paddle/legacy/capi/examples/README.md b/paddle/legacy/capi/examples/README.md
deleted file mode 100644
index 14013e281ff..00000000000
--- a/paddle/legacy/capi/examples/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# C-API Example Usage
-
-* [Model Inference](./model_inference/README.md)
diff --git a/paddle/legacy/capi/examples/model_inference/README.md b/paddle/legacy/capi/examples/model_inference/README.md
deleted file mode 100644
index 58e6c83140b..00000000000
--- a/paddle/legacy/capi/examples/model_inference/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Use C-API for Model Inference
-
-There are several examples in this directory about how to use Paddle C-API for model inference.
-
-## Convert configuration file to protobuf binary.
-
-Firstly, the user should convert Paddle's model configuration file into a protobuf binary file. In each example directory, there is a file named `convert_protobin.sh`. It will convert `trainer_config.conf` into `trainer_config.bin`.
-
-The `convert_protobin.sh` is very simple, just invoke `dump_config` Python module to dump the binary file. The command line usages are:
-
-```bash
-python -m paddle.utils.dump_config YOUR_CONFIG_FILE 'CONFIG_EXTRA_ARGS' --binary > YOUR_CONFIG_FILE.bin
-```
-
-## Initialize paddle
-
-```c++
-char* argv[] = {"--use_gpu=False"};
-paddle_init(1, (char**)argv);
-```
-
-We must initialize global context before we invoke other interfaces in Paddle. The initialize commands just like the `paddle_trainer` command line arguments.  `paddle train --help`,  will show the list of arguments. The most important argument is `use_gpu` or not.
-
-## Load network and parameters
-
-```c
-paddle_gradient_machine machine;
-paddle_gradient_machine_create_for_inference(&machine, config_file_content, content_size));
-paddle_gradient_machine_load_parameter_from_disk(machine, "./some_where_to_params"));
-```
-
-The gradient machine is a Paddle concept, which represents a neural network can be forwarded and backward. We can create a gradient machine fo model inference, and load the parameter files from disk.
-
-Moreover, if we want to inference in multi-thread, we could create a thread local gradient machine which shared the same parameter by using `paddle_gradient_machine_create_shared_param` API. Please reference `multi_thread` as an example.
-
-## Create input
-
-The input of a neural network is an `arguments`. The examples in this directory will show how to construct different types of inputs for prediction. Please look at `dense`, `sparse_binary`, `sequence` for details.
-
-## Get inference
-
-After invoking `paddle_gradient_machine_forward`, we could get the output of the neural network.  The `value` matrix of output arguments will store the neural network output values. If the output is a `SoftmaxActivation`, the `value` matrix are the probabilities of each input samples. The height of output matrix is number of sample. The width is the number of categories.
diff --git a/paddle/legacy/capi/examples/model_inference/common/common.h b/paddle/legacy/capi/examples/model_inference/common/common.h
deleted file mode 100644
index 23248b0caf9..00000000000
--- a/paddle/legacy/capi/examples/model_inference/common/common.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef __CAPI_EXAMPLE_COMMON_H__
-#define __CAPI_EXAMPLE_COMMON_H__
-#include <stdio.h>
-#include <stdlib.h>
-
-#define CHECK(stmt)                                                      \
-  do {                                                                   \
-    paddle_error __err__ = stmt;                                         \
-    if (__err__ != kPD_NO_ERROR) {                                       \
-      fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
-      exit(__err__);                                                     \
-    }                                                                    \
-  } while (0)
-
-void* read_config(const char* filename, long* size) {
-  FILE* file = fopen(filename, "r");
-  if (file == NULL) {
-    fprintf(stderr, "Open %s error\n", filename);
-    return NULL;
-  }
-  fseek(file, 0L, SEEK_END);
-  *size = ftell(file);
-  fseek(file, 0L, SEEK_SET);
-  void* buf = malloc(*size);
-  fread(buf, 1, *size, file);
-  fclose(file);
-  return buf;
-}
-#endif
diff --git a/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
deleted file mode 100644
index 008a488fd9e..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-project(dense)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
deleted file mode 100755
index 30ffc316ecb..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-python -m paddle.utils.dump_config trainer_config.py '' --binary > trainer_config.bin
diff --git a/paddle/legacy/capi/examples/model_inference/dense/main.c b/paddle/legacy/capi/examples/model_inference/dense/main.c
deleted file mode 100644
index 90444889a74..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/main.c
+++ /dev/null
@@ -1,116 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-
-#include "../common/common.h"
-
-// Modify this path as needed.
-#define CONFIG_BIN "./trainer_config.bin"
-// Modify this path as needed.
-// This demo assumes that merged model is not used, then this path is the
-// directory storing all the trained parameters.
-// If the model is trained by PaddlePaddle V2 API, the model is saved as
-// a compressed file. You need to uncompress the compressed file first.
-#define MODEL_PATH "models/pass_4"
-
-int main() {
-  // Initalize the PaddlePaddle runtime environment.
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Read the binary configuration file generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create the gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-
-  // Load the trained model. Modify the parameter MODEL_PATH to set the correct
-  // path of the trained model.
-  CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH));
-
-  // Inputs and outputs of the network are organized as paddle_arguments object
-  // in C-API. In the comments below, "argument" specifically means one input of
-  // the neural network in PaddlePaddle C-API.
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one data layer in this demo MNIST network, invoke this
-  // function to create one argument.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Each argument needs one matrix or one ivector (integer vector, for sparse
-  // index input, usually used in NLP task) to holds the real input data.
-  // In the comments below, "matrix" specifically means the object needed by
-  // argument to hold the data. Here we create the matrix for the above created
-  // agument to store the testing samples.
-  paddle_matrix mat =
-      paddle_matrix_create(/* height = batch size */ 1,
-                           /* width = dimensionality of the data layer */ 784,
-                           /* whether to use GPU */ false);
-
-  paddle_real* array;
-  // Get the pointer pointing to the start address of the first row of the
-  // created matrix.
-  CHECK(paddle_matrix_get_row(mat, 0, &array));
-
-  // Fill the matrix with a randomly generated test sample.
-  srand(time(0));
-  for (int i = 0; i < 784; ++i) {
-    array[i] = rand() / ((float)RAND_MAX);
-  }
-
-  // Assign the matrix to the argument.
-  CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-  // Create the output argument.
-  paddle_arguments out_args = paddle_arguments_create_none();
-
-  // Invoke the forward computation.
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* is train taks or not */ false));
-
-  // Create the matrix to hold the forward result of the neural network.
-  paddle_matrix prob = paddle_matrix_create_none();
-  // Access the matrix of the output argument, the predicted result is stored in
-  // which.
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  uint64_t height;
-  uint64_t width;
-  CHECK(paddle_matrix_get_shape(prob, &height, &width));
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: \n");
-  for (int i = 0; i < height * width; ++i) {
-    printf("%.4f ", array[i]);
-    if ((i + 1) % width == 0) {
-      printf("\n");
-    }
-  }
-  printf("\n");
-
-  // The cleaning up.
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
deleted file mode 100644
index 673aba2036c..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.utils.merge_model import merge_v2_model
-
-from mnist_v2 import network
-
-net = network(is_infer=True)
-param_file = "models/params_pass_4.tar"
-output_file = "output.paddle.model"
-merge_v2_model(net, param_file, output_file)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py b/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
deleted file mode 100644
index 3fd15d658ad..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import gzip
-import logging
-import argparse
-from PIL import Image
-import numpy as np
-
-import paddle.v2 as paddle
-from paddle.utils.dump_v2_config import dump_v2_config
-
-logger = logging.getLogger("paddle")
-logger.setLevel(logging.INFO)
-
-
-def multilayer_perceptron(img, layer_size, lbl_dim):
-    for idx, size in enumerate(layer_size):
-        hidden = paddle.layer.fc(input=(img if not idx else hidden),
-                                 size=size,
-                                 act=paddle.activation.Relu())
-    return paddle.layer.fc(input=hidden,
-                           size=lbl_dim,
-                           act=paddle.activation.Softmax())
-
-
-def network(input_dim=784, lbl_dim=10, is_infer=False):
-    images = paddle.layer.data(
-        name='pixel', type=paddle.data_type.dense_vector(input_dim))
-
-    predict = multilayer_perceptron(
-        images, layer_size=[128, 64], lbl_dim=lbl_dim)
-
-    if is_infer:
-        return predict
-    else:
-        label = paddle.layer.data(
-            name='label', type=paddle.data_type.integer_value(lbl_dim))
-        return paddle.layer.classification_cost(input=predict, label=label)
-
-
-def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"):
-    if task == "train":
-        if not os.path.exists(save_dir):
-            os.mkdir(save_dir)
-
-        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
-        cost = network()
-        parameters = paddle.parameters.create(cost)
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.1 / 128.0,
-            momentum=0.9,
-            regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-
-        trainer = paddle.trainer.SGD(cost=cost,
-                                     parameters=parameters,
-                                     update_equation=optimizer)
-
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 100 == 0:
-                    logger.info("Pass %d, Batch %d, Cost %f, %s" %
-                                (event.pass_id, event.batch_id, event.cost,
-                                 event.metrics))
-            if isinstance(event, paddle.event.EndPass):
-                with gzip.open(
-                        os.path.join(save_dir, "params_pass_%d.tar" %
-                                     event.pass_id), "w") as f:
-                    trainer.save_parameter_to_tar(f)
-
-        trainer.train(
-            reader=paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.mnist.train(), buf_size=8192),
-                batch_size=128),
-            event_handler=event_handler,
-            num_passes=5)
-    elif task == "dump_config":
-        predict = network(is_infer=True)
-        dump_v2_config(predict, "trainer_config.bin", True)
-    else:
-        raise RuntimeError(("Error value for parameter task. "
-                            "Available options are: train and dump_config."))
-
-
-def parse_cmd():
-    parser = argparse.ArgumentParser(
-        description="PaddlePaddle MNIST demo for CAPI.")
-    parser.add_argument(
-        "--task",
-        type=str,
-        required=False,
-        help=("A string indicating the taks type. "
-              "Available options are: \"train\", \"dump_config\"."),
-        default="train")
-    parser.add_argument(
-        "--use_gpu",
-        type=bool,
-        help=("A bool flag indicating whether to use GPU device or not."),
-        default=False)
-    parser.add_argument(
-        "--trainer_count",
-        type=int,
-        help=("This parameter is only used in training task. It indicates "
-              "how many computing threads are created in training."),
-        default=1)
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        help=("This parameter is only used in training task. It indicates "
-              "path of the directory to save the trained models."),
-        default="models")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_cmd()
-    main(args.task, args.use_gpu, args.trainer_count, args.save_dir)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py b/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
deleted file mode 100644
index eca2dce114b..00000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore b/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
deleted file mode 100644
index fab7372d796..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
deleted file mode 100644
index 2fc8debdded..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-project(multi_thread)
-cmake_minimum_required(VERSION 2.8)
-
-find_package (Threads)
-
-if(NOT PADDLE_ROOT)
-  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
-endif()
-if(PADDLE_ROOT)
-  include_directories(${PADDLE_ROOT}/include)
-  link_directories(${PADDLE_ROOT}/lib)
-endif()
-
-set(CPU_SRCS main.c)
-add_executable(${PROJECT_NAME} ${CPU_SRCS})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME}
-                      -lpaddle_capi_shared
-                      ${CMAKE_THREAD_LIBS_INIT})
-
-find_package(CUDA QUIET)
-if(CUDA_FOUND)
-  set(GPU_SRCS main_gpu.c)
-  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
-  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
-  target_link_libraries(${PROJECT_NAME}_gpu
-                        -lpaddle_capi_shared
-                        ${CMAKE_THREAD_LIBS_INIT})
-endif(CUDA_FOUND)
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
deleted file mode 100644
index b29f2cd2141..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/main.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main.c
deleted file mode 100644
index 0a99e6b9c8d..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/main.c
+++ /dev/null
@@ -1,112 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <pthread.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-#define NUM_THREAD 4
-#define NUM_ITER 1000
-
-pthread_mutex_t mutex;
-
-void* thread_main(void* gm_ptr) {
-  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
-  paddle_arguments in_args = paddle_arguments_create_none();
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ false);
-  paddle_arguments out_args = paddle_arguments_create_none();
-  paddle_matrix prob = paddle_matrix_create_none();
-  for (int iter = 0; iter < NUM_ITER; ++iter) {
-    // There is only one input of this network.
-    CHECK(paddle_arguments_resize(in_args, 1));
-
-    paddle_real* array;
-
-    // Get First row.
-    CHECK(paddle_matrix_get_row(mat, 0, &array));
-
-    for (int i = 0; i < 784; ++i) {
-      array[i] = rand() / ((float)RAND_MAX);
-    }
-
-    CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-    CHECK(paddle_gradient_machine_forward(machine,
-                                          in_args,
-                                          out_args,
-                                          /* isTrain */ false));
-
-    CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-    CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-    pthread_mutex_lock(&mutex);
-    printf("Prob: ");
-    for (int i = 0; i < 10; ++i) {
-      printf("%.2f ", array[i]);
-    }
-    printf("\n");
-    pthread_mutex_unlock(&mutex);
-  }
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-  return NULL;
-}
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  srand(time(0));
-  pthread_mutex_init(&mutex, NULL);
-
-  pthread_t threads[NUM_THREAD];
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    paddle_gradient_machine thread_local_machine;
-    CHECK(paddle_gradient_machine_create_shared_param(
-        machine, buf, size, &thread_local_machine));
-    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
-  }
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    pthread_join(threads[i], NULL);
-  }
-
-  pthread_mutex_destroy(&mutex);
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
deleted file mode 100644
index 60f0c59e771..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
+++ /dev/null
@@ -1,127 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <pthread.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-#define NUM_THREAD 4
-#define NUM_ITER 1000
-
-pthread_mutex_t mutex;
-
-/*
- * @brief It is an simple inference example that runs multi-threads on a GPU.
- *        Each thread holds it own local gradient_machine but shares the same
- *        parameters.
- *        If you want to run on different GPUs, you need to launch
- *        multi-processes or set trainer_count > 1.
- */
-void* thread_main(void* gm_ptr) {
-  // Initialize the thread environment of Paddle.
-  CHECK(paddle_init_thread());
-
-  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
-  // Create input arguments.
-  paddle_arguments in_args = paddle_arguments_create_none();
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ true);
-  // Create output arguments.
-  paddle_arguments out_args = paddle_arguments_create_none();
-  // Create output matrix.
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  // CPU buffer to cache the input and output.
-  paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
-  paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
-  for (int iter = 0; iter < NUM_ITER; ++iter) {
-    // There is only one input layer of this network.
-    CHECK(paddle_arguments_resize(in_args, 1));
-    CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-    for (int i = 0; i < 784; ++i) {
-      cpu_input[i] = rand() / ((float)RAND_MAX);
-    }
-    CHECK(paddle_matrix_set_value(mat, cpu_input));
-
-    CHECK(paddle_gradient_machine_forward(machine,
-                                          in_args,
-                                          out_args,
-                                          /* isTrain */ false));
-
-    CHECK(paddle_arguments_get_value(out_args, 0, prob));
-    CHECK(paddle_matrix_get_value(prob, cpu_output));
-
-    pthread_mutex_lock(&mutex);
-    printf("Prob: ");
-    for (int i = 0; i < 10; ++i) {
-      printf("%.2f ", cpu_output[i]);
-    }
-    printf("\n");
-    pthread_mutex_unlock(&mutex);
-  }
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  free(cpu_input);
-  free(cpu_output);
-
-  return NULL;
-}
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=True"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  srand(time(0));
-  pthread_mutex_init(&mutex, NULL);
-
-  pthread_t threads[NUM_THREAD];
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    paddle_gradient_machine thread_local_machine;
-    CHECK(paddle_gradient_machine_create_shared_param(
-        machine, buf, size, &thread_local_machine));
-    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
-  }
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    pthread_join(threads[i], NULL);
-  }
-
-  pthread_mutex_destroy(&mutex);
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
deleted file mode 100755
index fa6a12319a9..00000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reservedd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/.gitignore b/paddle/legacy/capi/examples/model_inference/sequence/.gitignore
deleted file mode 100644
index fab7372d796..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
deleted file mode 100644
index 71b73acba7c..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-project(sequence)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
deleted file mode 100644
index b29f2cd2141..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/main.c b/paddle/legacy/capi/examples/model_inference/sequence/main.c
deleted file mode 100644
index 25a38d32f0b..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/main.c
+++ /dev/null
@@ -1,84 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one input of this network.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Create input ids.
-  int sentence_ids[] = {83, 48, 20, 84, 394, 853, 64, 53, 64};
-
-  paddle_ivector sentence = paddle_ivector_create(
-      sentence_ids, sizeof(sentence_ids) / sizeof(int), false, false);
-  CHECK(paddle_arguments_set_ids(in_args, 0, sentence));
-
-  int seq_pos_array[] = {0, sizeof(sentence_ids) / sizeof(int)};
-
-  paddle_ivector seq_pos = paddle_ivector_create(
-      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
-
-  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
-
-  paddle_arguments out_args = paddle_arguments_create_none();
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  paddle_real* array;
-
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: ");
-  for (int i = 0; i < 2; ++i) {
-    printf("%.2f ", array[i]);
-  }
-  printf("\n");
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_ivector_destroy(seq_pos));
-  CHECK(paddle_ivector_destroy(sentence));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
deleted file mode 100644
index 62ae97e2627..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-WORD_DIM = 3000
-
-sentence = data_layer(name='sentence', size=WORD_DIM)
-sentence_embedding = embedding_layer(
-    input=sentence,
-    size=64,
-    param_attr=ParameterAttribute(
-        initial_max=1.0, initial_min=0.5))
-lstm = simple_lstm(input=sentence_embedding, size=64)
-lstm_last = last_seq(input=lstm)
-outputs(fc_layer(input=lstm_last, size=2, act=SoftmaxActivation()))
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
deleted file mode 100644
index fab7372d796..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
deleted file mode 100644
index c8219568890..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-project(sparse_binary)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-find_package (Threads)
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
deleted file mode 100644
index b29f2cd2141..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c b/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
deleted file mode 100644
index 8df1b600885..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
+++ /dev/null
@@ -1,87 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Read the binary configuration file which is generated by
-  // `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create the gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Load the trained parameters. Uncomment the following line and change the
-  // directory as needed.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one input of this network.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Create the input matrix.
-  paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
-  srand(time(0));
-  paddle_real* array;
-  int colBuf[] = {9, 93, 109};
-  int rowBuf[] = {0, sizeof(colBuf) / sizeof(int)};
-
-  CHECK(paddle_matrix_sparse_copy_from(mat,
-                                       rowBuf,
-                                       sizeof(rowBuf) / sizeof(int),
-                                       colBuf,
-                                       sizeof(colBuf) / sizeof(int),
-                                       NULL,
-                                       0));
-
-  CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-  paddle_arguments out_args = paddle_arguments_create_none();
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
-  }
-  printf("\n");
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
deleted file mode 100755
index fa6a12319a9..00000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reservedd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/gradient_machine.cpp b/paddle/legacy/capi/gradient_machine.cpp
deleted file mode 100644
index 0c5ddd856b5..00000000000
--- a/paddle/legacy/capi/gradient_machine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gradient_machine.h"
-#include "capi_private.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
-};
-
-namespace paddle {
-
-class MyNeuralNetwork : public NeuralNetwork {
- public:
-  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
-      : NeuralNetwork(name, network) {}
-};
-
-NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                      NeuralNetwork* network) {
-  return new MyNeuralNetwork(name, network);
-}
-}  // namespace paddle
-
-extern "C" {
-paddle_error paddle_gradient_machine_create_for_inference(
-    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
-  if (modelConfigProtobuf == nullptr) return kPD_NULLPTR;
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
-    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
-  if (mergedModel == nullptr) return kPD_NULLPTR;
-  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
-  int64_t modelConfigSize = 0;
-  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
-  std::string modelConfigProtobuf;
-  modelConfigProtobuf.resize(modelConfigSize);
-  is.read(&modelConfigProtobuf[0], modelConfigSize);
-  paddle::TrainerConfig config;
-  paddle::ModelConfig modelConfig;
-  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
-    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
-        !modelConfig.IsInitialized()) {
-      return kPD_PROTOBUF_ERROR;
-    }
-  } else {
-    modelConfig = config.model_config();
-  }
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
-  delete cast(machine);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_load_parameter_from_disk(
-    paddle_gradient_machine machine, const char* path) {
-  auto m = cast(machine);
-  if (m == nullptr || path == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->loadParameters(path);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
-                                             paddle_arguments inArgs,
-                                             paddle_arguments outArgs,
-                                             bool isTrain) {
-  auto m = cast(machine);
-  auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
-  if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->forward(
-      in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_shared_param(
-    paddle_gradient_machine origin,
-    void* modelConfigProtobuf,
-    int size,
-    paddle_gradient_machine* slave) {
-  auto o = cast(origin);
-  if (origin == nullptr || slave == nullptr || o->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
-      new paddle::capi::CGradientMachine());
-  auto nn = paddle::NeuralNetwork::create(config);
-  nn->init(config,
-           [&o](int paramId, paddle::Parameter* param) {
-             auto p = o->machine->getParameters()[paramId];
-             param->enableSharedType(paddle::PARAMETER_VALUE,
-                                     p->getBuf(paddle::PARAMETER_VALUE));
-           },
-           {paddle::PARAMETER_VALUE},
-           false);
-  ptr->machine.reset(nn);
-  *slave = ptr.release();
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_error paddle_gradient_machine_randomize_param(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
-  m->machine->randParameters();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_get_layer_output(
-    paddle_gradient_machine machine,
-    const char* layerName,
-    paddle_arguments args) {
-  auto m = cast(machine);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
-  if (m == nullptr || layerName == nullptr || out == nullptr ||
-      m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-
-  auto layerOutput = m->machine->getLayerOutput(layerName);
-  out->args.push_back(layerOutput);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_release_layer_output(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  m->machine->releaseOutput();
-  return kPD_NO_ERROR;
-}
diff --git a/paddle/legacy/capi/gradient_machine.h b/paddle/legacy/capi/gradient_machine.h
deleted file mode 100644
index f46498b3753..00000000000
--- a/paddle/legacy/capi/gradient_machine.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_GRADIENT_MACHINE_H__
-#define __PADDLE_CAPI_GRADIENT_MACHINE_H__
-#include "arguments.h"
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-/**
- * @brief GradientMachine means a neural network.
- */
-typedef void* paddle_gradient_machine;
-
-/**
- * @brief Create a gradient machine used for model inference.
- * @param [out] machine that used for model inference.
- * @param [in] modelConfigProtobuf
- * @param [in] size
- * @return paddle_error
- */
-PD_API paddle_error paddle_gradient_machine_create_for_inference(
-    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
-
-/**
- * @brief Create a gradient machine used for model inference, using config with
- *        parameters which is generated by `paddle merge_model`.
- *        Example:
- *          paddle merge_model \
- *                 --model_dir="pass-00000" \
- *                 --model_file="merged_model.paddle"
- * @param [out] machine that used for model inference
- * @param [in] mergedModel
- * @param [in] size
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_create_for_inference_with_parameters(
-    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
-
-/**
- * @brief Load parameter from disk.
- * @param machine Gradient Machine.
- * @param path local directory path.
- * @return paddle_error
- */
-PD_API paddle_error paddle_gradient_machine_load_parameter_from_disk(
-    paddle_gradient_machine machine, const char* path);
-
-/**
- * @brief Forward a gradient machine
- * @param machine Gradient machine
- * @param inArgs input arguments
- * @param outArgs output arguments
- * @param isTrain is train or not
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_forward(paddle_gradient_machine machine,
-                                paddle_arguments inArgs,
-                                paddle_arguments outArgs,
-                                bool isTrain);
-
-/**
- * @brief Create a gradient machine, which parameters are shared from another
- *        gradient machine.
- * @param [in] origin gradient machine
- * @param [in] modelConfigProtobuf model config protobuf
- * @param [in] size of model config buffer.
- * @param [out] slave gradient machine, the output value.
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_create_shared_param(paddle_gradient_machine origin,
-                                            void* modelConfigProtobuf,
-                                            int size,
-                                            paddle_gradient_machine* slave);
-
-PD_API paddle_error
-paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
-
-/**
- * @brief Destroy a gradient machine
- * @param machine that need to destroy
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_destroy(paddle_gradient_machine machine);
-
-/**
- * @brief Get the output of the layer named `layerName`.
- * @param [in] gradient machine that have run a inference
- * @param [in] layerName name of specified layer
- * @param [out] args output of the specified layer
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
-                                         const char* layerName,
-                                         paddle_arguments args);
-
-/**
- * @brief Release the middle layer's output memory of the gradient machine.
- * @param [in] gradient machine that have run a inference
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/paddle/legacy/capi/main.h b/paddle/legacy/capi/main.h
deleted file mode 100644
index a0cb7bc2967..00000000000
--- a/paddle/legacy/capi/main.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_MAIN_H__
-#define __PADDLE_CAPI_MAIN_H__
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Initialize Paddle.
- */
-PD_API paddle_error paddle_init(int argc, char** argv);
-
-/**
- * Initialize the thread environment of Paddle.
- * @note it is requisite for GPU runs but optional for CPU runs.
- *       For GPU runs, all threads will run on the same GPU devices.
- */
-PD_API paddle_error paddle_init_thread();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/matrix.h b/paddle/legacy/capi/matrix.h
deleted file mode 100644
index f6747f7b1a1..00000000000
--- a/paddle/legacy/capi/matrix.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_MATRIX_H__
-#define __PADDLE_CAPI_MATRIX_H__
-
-#include <stdbool.h>
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Matrix functions. Return will be a paddle_error type.
- */
-typedef void* paddle_matrix;
-
-/**
- * @brief paddle_matrix_create Create a dense matrix
- * @param height matrix height.
- * @param width matrix width
- * @param useGpu use GPU of not
- * @return Matrix handler
- */
-PD_API paddle_matrix paddle_matrix_create(uint64_t height,
-                                          uint64_t width,
-                                          bool useGpu);
-
-/**
- * @brief paddle_matrix_create_sparse Create a sparse matrix.
- * @param height the matrix height.
- * @param width the matrix width.
- * @param nnz the number of non-zero elements.
- * @param isBinary is binary (either 1 or 0 in matrix) or not.
- * @param useGpu is using GPU or not.
- * @return paddle_matrix.
- * @note Mobile inference does not support this interface.
- */
-PD_API paddle_matrix paddle_matrix_create_sparse(
-    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
-
-/**
- * @brief paddle_matrix_destroy Destroy a matrix.
- * @param mat
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_destroy(paddle_matrix mat);
-
-/**
- * @brief paddle_matrix_set_row Set a row to matrix.
- * @param mat Target Matrix
- * @param rowID Index of row
- * @param rowArray Row data.
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
-                                          uint64_t rowID,
-                                          paddle_real* rowArray);
-
-/**
- * @brief paddle_matrix_set_value Set value to matrix.
- * @param mat Target Matrix
- * @param value Row data.
- * @return paddle_error
- * @note  value should contain enough element of data to init the mat
- */
-PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                            paddle_real* value);
-
-/**
- * @brief PDMatGetRow Get raw row buffer from matrix
- * @param [in] mat Target matrix
- * @param [in] rowID Index of row.
- * @param [out] rawRowBuffer Row Buffer
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
-                                          uint64_t rowID,
-                                          paddle_real** rawRowBuffer);
-
-/**
- * @brief copy data from the matrix
- * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data
- * @return paddle_error
- * @note the space of the result should allocated before invoke this API
- */
-PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                            paddle_real* result);
-/**
- * @brief PDMatCreateNone Create None Matrix
- * @return
- */
-PD_API paddle_matrix paddle_matrix_create_none();
-
-/**
- * @brief PDMatGetShape get the shape of matrix
- * @param mat target matrix
- * @param height The height of matrix
- * @param width The width of matrix
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
-                                            uint64_t* height,
-                                            uint64_t* width);
-
-/**
- * @brief paddle_matrix_sparse_copy_from Copy from a CSR format matrix
- * @param [out] mat output matrix
- * @param [in] rowArray row array. The array slices in column array.
- * @param [in] rowSize length of row array.
- * @param [in] colArray the column array. It means the non-zero element indices
- * in each row.
- * @param [in] colSize length of column array.
- * @param [in] valueArray the value array. It means the non-zero elemnt values.
- * NULL if the matrix is binary.
- * @param [in] valueSize length of value array. Zero if the matrix is binary.
- * @return paddle_error
- * @note Mobile inference does not support this interface.
- */
-PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
-                                                   int* rowArray,
-                                                   uint64_t rowSize,
-                                                   int* colArray,
-                                                   uint64_t colSize,
-                                                   float* valueArray,
-                                                   uint64_t valueSize);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/paddle/legacy/capi/paddle_capi.map b/paddle/legacy/capi/paddle_capi.map
deleted file mode 100644
index 8d673f675dd..00000000000
--- a/paddle/legacy/capi/paddle_capi.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-	global:
-		paddle_*;
-	local:
-		*;
-};
diff --git a/paddle/legacy/capi/tests/.gitignore b/paddle/legacy/capi/tests/.gitignore
deleted file mode 100644
index 7ab6be95e39..00000000000
--- a/paddle/legacy/capi/tests/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-w
-b
diff --git a/paddle/legacy/capi/tests/CMakeLists.txt b/paddle/legacy/capi/tests/CMakeLists.txt
deleted file mode 100644
index bb38ace6280..00000000000
--- a/paddle/legacy/capi/tests/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-add_unittest(capi_test_mats test_Vector.cpp
-  test_Matrix.cpp test_Arguments.cpp)
-
-target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
-target_link_libraries(capi_test_mats paddle_capi)
-
-if(NOT MOBILE_INFERENCE)
-    add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
-    target_include_directories(capi_test_gradientMachine PUBLIC
-      ${PADDLE_CAPI_INC_PATH})
-    target_link_libraries(capi_test_gradientMachine paddle_capi)
-    add_test(NAME capi_test_gradientMachine
-      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
-endif()
diff --git a/paddle/legacy/capi/tests/test_Arguments.cpp b/paddle/legacy/capi/tests/test_Arguments.cpp
deleted file mode 100644
index 6fb379719dc..00000000000
--- a/paddle/legacy/capi/tests/test_Arguments.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "capi.h"
-#include "gtest/gtest.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(CAPIArguments, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_arguments args = paddle_arguments_create_none();
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
-  ASSERT_EQ(0UL, size);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, value) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_matrix mat = paddle_matrix_create(128, 64, false);
-  for (size_t i = 0; i < 128; ++i) {
-    std::vector<paddle_real> sampleBuf = randomBuffer(64);
-    paddle_matrix_set_row(mat, i, sampleBuf.data());
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
-
-  paddle_matrix val = paddle_matrix_create_none();
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
-
-  for (size_t i = 0; i < 128; ++i) {
-    paddle_real* row1;
-    paddle_real* row2;
-
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
-    ASSERT_EQ(row1, row2);
-  }
-
-  paddle_ivector ivec = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, ids) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-template <typename T1, typename T2>
-void testSequenceHelper(T1 setter, T2 getter) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
-
-  int* rawBuf;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_EQ(array[i], rawBuf[i]);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, Sequence) {
-  auto testSequence = [](uint32_t nestedLevel) {
-    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3),
-                       std::bind(paddle_arguments_get_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3));
-  };
-  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
-    testSequence(i);
-  }
-}
diff --git a/paddle/legacy/capi/tests/test_GradientMachine.cpp b/paddle/legacy/capi/tests/test_GradientMachine.cpp
deleted file mode 100644
index 5d1b7cb6ca4..00000000000
--- a/paddle/legacy/capi/tests/test_GradientMachine.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/legacy/trainer/TrainerConfigHelper.h>
-#include <stdlib.h>
-#include <string.h>
-#include <type_traits>
-#include "capi.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(GradientMachine, testPredict) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle::TrainerConfigHelper config("./test_predict_network.py");
-  std::string buffer;
-  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
-  paddle_gradient_machine machine;
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_for_inference(
-                &machine, &buffer[0], (int)buffer.size()));
-  std::unique_ptr<paddle::GradientMachine> gm(
-      paddle::GradientMachine::create(config.getModelConfig()));
-  ASSERT_NE(nullptr, gm);
-  gm->randParameters();
-  gm->saveParameters("./");
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
-
-  paddle_gradient_machine machineSlave;
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_shared_param(
-                machine, &buffer[0], (int)buffer.size(), &machineSlave));
-  std::swap(machineSlave, machine);
-  paddle_arguments outArgs = paddle_arguments_create_none();
-
-  paddle_arguments inArgs = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
-  paddle_matrix mat = paddle_matrix_create(1, 100, false);
-  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
-
-  auto data = randomBuffer(100);
-  paddle_real* rowPtr;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
-
-  uint64_t sz;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
-  ASSERT_EQ(1UL, sz);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
-  std::vector<paddle::Argument> paddleInArgs;
-  std::vector<paddle::Argument> paddleOutArgs;
-  paddleInArgs.resize(1);
-  paddleInArgs[0].value =
-      paddle::Matrix::create(data.data(), 1, 100, false, false);
-
-  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
-
-  auto matPaddle = paddleOutArgs[0].value;
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(matPaddle->getHeight(), height);
-  ASSERT_EQ(matPaddle->getWidth(), width);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  for (size_t i = 0; i < width; ++i) {
-    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
-  std::swap(machineSlave, machine);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  std::vector<char*> argvs;
-  argvs.push_back(strdup("--use_gpu=false"));
-  paddle_init((int)argvs.size(), argvs.data());
-  for (auto each : argvs) {
-    free(each);
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/capi/tests/test_Matrix.cpp b/paddle/legacy/capi/tests/test_Matrix.cpp
deleted file mode 100644
index 5ba051ae179..00000000000
--- a/paddle/legacy/capi/tests/test_Matrix.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "gtest/gtest.h"
-
-TEST(CAPIMatrix, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_matrix mat = paddle_matrix_create(128, 32, false);
-  std::vector<paddle_real> sampleRow;
-  sampleRow.resize(32);
-  for (size_t i = 0; i < sampleRow.size(); ++i) {
-    sampleRow[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, 0, sampleRow.data()));
-  ASSERT_EQ(kPD_OUT_OF_RANGE,
-            paddle_matrix_set_row(mat, 128, sampleRow.data()));
-
-  paddle_real* arrayPtr;
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &arrayPtr));
-  for (size_t i = 0; i < sampleRow.size(); ++i) {
-    ASSERT_NEAR(sampleRow[i], arrayPtr[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-TEST(CAPIMatrix, createNone) {
-  paddle_matrix mat = paddle_matrix_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-TEST(CAPIMatrix, cpu_get_set_value) {
-  paddle_matrix mat = paddle_matrix_create(128, 32, false);
-  std::vector<paddle_real> sample;
-  std::vector<paddle_real> result;
-  sample.resize(128 * 32);
-  result.resize(128 * 32);
-  for (size_t i = 0; i < sample.size(); ++i) {
-    sample[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
-  for (size_t i = 0; i < sample.size(); ++i) {
-    ASSERT_NEAR(sample[i], result[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(CAPIMatrix, gpu_get_set_value) {
-  paddle_matrix mat = paddle_matrix_create(128, 32, true);
-  std::vector<paddle_real> sample;
-  std::vector<paddle_real> result;
-  sample.resize(128 * 32);
-  result.resize(128 * 32);
-  for (size_t i = 0; i < sample.size(); ++i) {
-    sample[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
-  for (size_t i = 0; i < sample.size(); ++i) {
-    ASSERT_NEAR(sample[i], result[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-#endif
diff --git a/paddle/legacy/capi/tests/test_Vector.cpp b/paddle/legacy/capi/tests/test_Vector.cpp
deleted file mode 100644
index fa7407e484c..00000000000
--- a/paddle/legacy/capi/tests/test_Vector.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "gtest/gtest.h"
-
-TEST(CAPIVector, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_ivector vec;
-  int array[3] = {1, 2, 3};
-  vec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_resize(vec, 1000));
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(vec, &size));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
-}
-
-TEST(CAPIVector, createNone) {
-  paddle_ivector vec = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
-}
diff --git a/paddle/legacy/capi/tests/test_predict_network.py b/paddle/legacy/capi/tests/test_predict_network.py
deleted file mode 100644
index b8efb25704d..00000000000
--- a/paddle/legacy/capi/tests/test_predict_network.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100)
-
-x = data_layer(name='x', size=100)
-
-y = fc_layer(
-    input=x,
-    size=100,
-    bias_attr=ParamAttr(name='b'),
-    param_attr=ParamAttr(name='w'))
-
-outputs(y)
diff --git a/paddle/legacy/capi/vector.h b/paddle/legacy/capi/vector.h
deleted file mode 100644
index a79f7fdf789..00000000000
--- a/paddle/legacy/capi/vector.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_VECTOR_H__
-#define __PADDLE_CAPI_VECTOR_H__
-
-#include <stdbool.h>
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Int Vector Functions. Return will be a paddle_error type.
- */
-typedef void* paddle_ivector;
-
-/**
- * @brief Create an none int vector. It just a handler and store nothing. Used
- *        to get output from other api.
- * @return None int vector.
- */
-PD_API paddle_ivector paddle_ivector_create_none();
-
-/**
- * @brief paddle_ivector_create create a paddle int vector
- * @param array: input array.
- * @param size: input array size.
- * @param copy: memory copy or just use same memory. True if copy.
- * @param useGPU: True if use GPU
- * @return paddle_error
- */
-PD_API paddle_ivector paddle_ivector_create(int* array,
-                                            uint64_t size,
-                                            bool copy,
-                                            bool useGPU);
-
-/**
- * @brief paddle_ivector_destroy destory an int vector.
- * @param ivec vector to be destoried.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_destroy(paddle_ivector ivec);
-
-/**
- * @brief paddle_ivector_get get raw buffer stored inside this int vector. It
- * could be GPU memory if this int vector is stored in GPU.
- * @param [in] ivec int vector
- * @param [out] buffer the return buffer pointer.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer);
-
-/**
- * @brief paddle_ivector_resize resize the int vector.
- * @param [in] ivec: int vector
- * @param [in] size: size to change
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size);
-
-/**
- * @brief paddle_ivector_get_size get the size of int vector.
- * @param [in] ivec: int vector
- * @param [out] size: return size of this int vector.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_get_size(paddle_ivector ivec,
-                                            uint64_t* size);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/cuda/CMakeLists.txt b/paddle/legacy/cuda/CMakeLists.txt
deleted file mode 100755
index 9bbb8de78e0..00000000000
--- a/paddle/legacy/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-set(AVX_SOURCES
-    src/hl_math.cc
-    src/hl_avx_functions.cc
-)
-
-if(WITH_AVX)
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc
-        ${AVX_SOURCES})
-else()
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc)
-endif()
-
-set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cuda_cublas.cc
-    src/hl_cuda_cudnn.cc
-    src/hl_cuda_device.cc)
-
-if(WITH_GPU)
-    set(CUDA_CXX_SOURCES
-        src/hl_warpctc_wrap.cc
-        ${CUDA_CXX_WITH_GPU_SOURCES})
-
-    set_source_files_properties(${CUDA_CXX_SOURCES}
-                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
-else()
-    if (NOT MOBILE_INFERENCE)
-    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
-    endif()
-endif()
-
-set(CUDA_CU_SOURCES
-    src/hl_perturbation_util.cu
-    src/hl_cuda_aggregate.cu
-    src/hl_cuda_matrix.cu
-    src/hl_cuda_sparse.cu
-    src/hl_cuda_cnn.cu
-    src/hl_cuda_lstm.cu
-    src/hl_top_k.cu
-    src/hl_batch_transpose.cu
-    src/hl_batch_norm.cu
-    src/hl_cuda_sequence.cu
-    src/hl_table_apply.cu)
-
-set(CUDA_HEADERS
-    include/hl_time.h
-    include/hl_warpctc_wrap.h
-    include/hl_sequence.h
-    include/hl_cuda_cublas.h
-    include/hl_batch_transpose.h
-    include/hl_avx_functions.h
-    include/hl_sparse.h
-    include/hl_functions.h
-    include/hl_cuda_cudnn.h
-    include/hl_activation_functions.h
-    include/hl_base.h
-    include/stub/hl_cuda_cudnn_stub.h
-    include/stub/hl_cuda_stub.h
-    include/stub/hl_cuda_cublas_stub.h
-    include/stub/hl_cnn_stub.h
-    include/stub/hl_lstm_stub.h
-    include/stub/hl_sequence_stub.h
-    include/stub/hl_aggregate_stub.h
-    include/stub/hl_sparse_stub.h
-    include/stub/hl_matrix_stub.h
-    include/hl_aggregate.h
-    include/hl_cuda.h
-    include/hl_lstm.h
-    include/hl_table_apply.h
-    include/hl_gpu.h
-    include/hl_top_k.h
-    include/hl_matrix.h
-    include/hl_cnn.h)
-
-if(WITH_GPU)
-    cuda_add_library(paddle_cuda
-        ${CUDA_SOURCES}
-        ${CUDA_CU_SOURCES}
-        ${CUDA_CXX_SOURCES})
-else()
-    add_library(paddle_cuda
-                ${CUDA_SOURCES}
-                ${CUDA_CXX_SOURCES})
-endif()
-
-add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
diff --git a/paddle/legacy/cuda/include/hl_activation_functions.h b/paddle/legacy/cuda/include/hl_activation_functions.h
deleted file mode 100644
index 66a69db545b..00000000000
--- a/paddle/legacy/cuda/include/hl_activation_functions.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_ACTIVATION_FUNCTIONS_H_
-#define HL_ACTIVATION_FUNCTIONS_H_
-
-#include "hl_functions.h"
-
-/**
- * Active functions: sigmoid, relu, tanh and linear.
- */
-#define HPPL_ACTIVE_FUNCTION \
-  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
-
-namespace hppl {
-
-/**
- * Hppl supports sigmoid, relu, tanh, linear active functions
- * for neural networks' forward and backward activation.
- */
-template <class T>
-class Active {
- public:
-  typedef T (*forward)(T);
-  typedef T (*backward)(T, T);
-};
-
-#ifdef __NVCC__
-namespace gpu {
-static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace gpu
-#else
-namespace cpu {
-static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace cpu
-
-#ifdef __AVX__
-namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace avx
-#endif
-#endif
-
-}  // namespace hppl
-
-#endif  // HL_ACTIVATION_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_aggregate.h b/paddle/legacy/cuda/include/hl_aggregate.h
deleted file mode 100644
index 1ca26aa3bbb..00000000000
--- a/paddle/legacy/cuda/include/hl_aggregate.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AGGREGATE_H_
-#define HL_AGGREGATE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Calculate the sum of each row of the matrix A_d.
- *
- * @param[in]    A_d     input matrix (M x N).
- * @param[out]   C_d     output matrix (M x 1).
- * @param[in]    dimM    matrix height.
- * @param[in]    dimN    matrix width.
- *
- */
-extern void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the maximum value of each row of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the minimum value of each row of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the sum of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output Matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the maximum value of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the minimum value of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   C_h = sum(A_d[i]).
- *
- * @param[in]   A_d     input(m).
- * @param[out]  C_h     output(host memory).
- * @param[in]   dimM    size of vector.
- *
- */
-extern void hl_vector_sum(real *A_d, real *C_h, int dimM);
-
-/**
- * @brief   C_h = sum(abs(A_d[i])).
- *
- * @param[in]   A_d     input(m).
- * @param[out]  C_h     output(host memory).
- * @param[in]   dimM    size of vector.
- *
- */
-extern void hl_vector_abs_sum(real *A_d, real *C_h, int dimM);
-
-#endif /* HL_AGGREGATE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_avx_functions.h b/paddle/legacy/cuda/include/hl_avx_functions.h
deleted file mode 100644
index 9fb99a36ea6..00000000000
--- a/paddle/legacy/cuda/include/hl_avx_functions.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AVX_FUNCTIONS_H_
-#define HL_AVX_FUNCTIONS_H_
-
-#include <immintrin.h>
-
-namespace hppl {
-__m256 relu(const __m256 a);
-__m256 sigmoid(const __m256 a);
-__m256 tanh(const __m256 a);
-__m256 linear(const __m256 a);
-
-__m256 relu(const __m256 a, const __m256 b);
-__m256 sigmoid(const __m256 a, const __m256 b);
-__m256 tanh(const __m256 a, const __m256 b);
-__m256 linear(const __m256 a, const __m256 b);
-}  // namespace hppl
-
-#endif  // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_base.h b/paddle/legacy/cuda/include/hl_base.h
deleted file mode 100644
index bfe812a4387..00000000000
--- a/paddle/legacy/cuda/include/hl_base.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstddef>
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define HL_FLOAT_MAX 3.40282347e+38F
-#define HL_FLOAT_MIN 1.17549435e-38F
-using real = double;
-#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
-using real = float;
-#endif
-
-/**
- * The maximum input value for exp, used to avoid overflow problem.
- * currently only used for tanh function.
- */
-#define EXP_MAX_INPUT 40.0
-
-/**
- * @brief DIVUP(x, y) is similar to ceil(x / y).
- * @note  For CUDA, DIVUP will be used to specify
- *        the size of blockDim.
- */
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
-#endif
-
-/**
- * HPPL is an internal high performance parallel computing library
- * for high-level neural network routines, which can support many
- * heterogeneous compute architectures, such as GPU, FPGA, etc.
- */
-
-/**
- * @brief   HPPL CUDA Stream.
- *
- * @note    Each thread can use HPPL_STREAM_* after calling hl_init.
- *          HPPL_STREAM_DEFAULT is HPPL default stream.
- */
-typedef enum {
-  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
-  HPPL_STREAM_1 = 1,
-  HPPL_STREAM_2 = 2,
-  HPPL_STREAM_3 = 3,
-  HPPL_STREAM_4 = 4,
-  HPPL_THREAD_STREAM_1 = 5,
-  HPPL_THREAD_STREAM_2 = 6,
-  HPPL_THREAD_STREAM_3 = 7,
-  HPPL_THREAD_STREAM_4 = 8,
-  HPPL_STREAM_END
-} hl_stream_t;
-
-/**
- * @brief HPPL activation mode.
- */
-typedef enum {
-  HL_ACTIVATION_SIGMOID = 0,
-  HL_ACTIVATION_RELU = 1,
-  HL_ACTIVATION_TANH = 2,
-  HL_ACTIVATION_LINEAR = 3,
-  HL_ACTIVATION_END
-} hl_activation_mode_t;
-
-/**
- * @brief Transpose type.
- */
-typedef enum {
-  HPPL_OP_N = 0, /* transpose */
-  HPPL_OP_T = 1, /* non transpose */
-  HPPL_OP_END
-} hl_trans_op_t;
-
-/**
- * @brief Lstm value.
- *
- * @param  gateValue         input value.
- * @param  prevStateValue    previous state value.
- * @param  stateValue        state value.
- * @param  stateActiveValue  state active value.
- * @param  outputValue       output value.
- */
-typedef struct {
-  real *gateValue;
-  real *prevStateValue;
-  real *stateValue;
-  real *stateActiveValue;
-  real *outputValue;
-  real *checkIg;
-  real *checkFg;
-  real *checkOg;
-} hl_lstm_value;
-
-/**
- * @brief Lstm gradient.
- *
- * @param  gateGrad          input gradient.
- * @param  prevStateGrad     previous state gradient.
- * @param  stateGrad         state gradient.
- * @param  stateActiveGrad   state active gradient.
- * @param  outputGrad        output gradient.
- */
-typedef struct {
-  real *gateGrad;
-  real *prevStateGrad;
-  real *stateGrad;
-  real *stateActiveGrad;
-  real *outputGrad;
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-} hl_lstm_grad;
-
-/**
- * @brief Gru value.
- *
- * @param  gateWeight           gate weight (updateGate + resetGate).
- * @param  stateWeight          frame state weight.
- * @param  gateValue            gate value results.
- * @param  resetOutputValue     resetOutput value.
- * @param  outputValue          output value.
- * @param  prevOutValue         previous output value.
- *
- */
-typedef struct {
-  real *gateWeight;
-  real *stateWeight;
-  real *gateValue;
-  real *resetOutputValue;
-  real *outputValue;
-  real *prevOutValue;
-} hl_gru_value;
-
-/**
- * @brief Gru gradient.
- *
- * @param  gateWeightGrad       gate weight gradient.
- * @param  stateWeightGrad      frame state weight gradient.
- * @param  gateGrad             gate gradient results.
- * @param  resetOutputGrad      resetOutput gradient.
- * @param  outputGrad           output gradient.
- * @param  prevOutGrad          previous output gradient.
- */
-typedef struct {
-  real *gateWeightGrad;
-  real *stateWeightGrad;
-  real *gateGrad;
-  real *resetOutputGrad;
-  real *outputGrad;
-  real *prevOutGrad;
-} hl_gru_grad;
-
-/**
- * @brief  Sparse matrix value type.
- */
-typedef enum {
-  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
-  HL_FLOAT_VALUE = 1,
-  HL_VALUE_END
-} hl_matrix_value_t;
-
-/**
- * @brief  HPPL matrix format.
- */
-typedef enum {
-  HL_SPARSE_CSR = 0,
-  HL_SPARSE_CSC = 1,
-  HL_SPARSE_END
-} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s *hl_matrix_s;
-
-/**
- * @brief   HPPL sparse matrix.
- *
- * @param  matrix     sparse matrix.
- * @param  format     matrix format.
- * @param  type       the type of matrix values.
- * @param  rows       matrix rows.
- * @param  cols       matrix columns.
- * @param  nnz        nonzero values of sparse matrix.
- */
-typedef struct {
-  hl_matrix_s matrix;
-  hl_matrix_format_t format;
-  hl_matrix_value_t type;
-  int rows;
-  int cols;
-  size_t nnz;
-} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-
-#ifdef __NVCC__
-
-#include <cuda_runtime.h>
-#include "paddle/legacy/cuda/include/hl_cuda.h"
-#include "paddle/legacy/utils/Logging.h"
-
-extern __thread bool g_sync_flag;
-extern __thread cudaStream_t default_stream;
-#define STREAM_DEFAULT default_stream
-
-/**
- * @brief   Check cuda kernel execution.
- * @param   msg   error string
- */
-#define CHECK_SYNC(msg)                                               \
-  if (true == g_sync_flag) {                                          \
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                       \
-    cudaError_t err = (cudaError_t)hl_get_device_last_error();        \
-    CHECK_EQ(cudaSuccess, err)                                        \
-        << "[" << msg << "] "                                         \
-        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
-  }
-
-// __shfl has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-
-template <typename T>
-__forceinline__ __device__ T
-__shfl_sync(unsigned, T val, int src_line, int width) {
-  return __shfl(val, src_line, width);
-}
-
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-#endif  // __NVCC__
diff --git a/paddle/legacy/cuda/include/hl_batch_norm.h b/paddle/legacy/cuda/include/hl_batch_norm.h
deleted file mode 100644
index 7814204d1b0..00000000000
--- a/paddle/legacy/cuda/include/hl_batch_norm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_BATCH_NORM_H_
-#define HL_BATCH_NORM_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   batch norm inferece.
- *
- * @param[in]   input         input data.
- * @param[out]  output        output data.
- * @param[in]   scale         batch normalization scale parameter (in original
- *                            paper scale is referred to as gamma).
- * @param[in]   bias          batch normalization bias parameter (in original
- *                            paper scale is referred to as beta).
- * @param[in]   estimatedMean
- * @param[in]   estimatedVar  The moving mean and variance
- *                            accumulated during the training phase are passed
- *                            as inputs here.
- * @param[in]   epsilon       Epsilon value used in the batch
- *                            normalization formula.
- */
-extern void hl_batch_norm_cuda_inference(const real* input,
-                                         real* output,
-                                         const real* scale,
-                                         const real* bias,
-                                         const real* estimatedMean,
-                                         const real* estimatedVar,
-                                         const double epsilon,
-                                         size_t batchSize,
-                                         size_t channel,
-                                         size_t height,
-                                         size_t width);
-
-#endif  // HL_BATCH_NORM_H_
diff --git a/paddle/legacy/cuda/include/hl_batch_transpose.h b/paddle/legacy/cuda/include/hl_batch_transpose.h
deleted file mode 100644
index a16d3764fc7..00000000000
--- a/paddle/legacy/cuda/include/hl_batch_transpose.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_BATCH_TRANSPOSE_H_
-#define HL_BATCH_TRANSPOSE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Perform matrix transpose for each data in the batch.
- *
- * @param[in]   input     height * width elements in batch.
- * @param[out]  output    height * width elements in batch.
- * @param[in]   width     width of batch data.
- * @param[in]   height    height of batch data.
- * @param[in]   batchSize batch size
- *
- * @note    Both the inpt and output are arranged in batch-first
- *          order. Each batch has height * width data, which are
- *          arranged in height-first (or row-first) manner.
- */
-extern void batchTranspose(
-    const real* input, real* output, int width, int height, int batchSize);
-
-#endif  // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/legacy/cuda/include/hl_cnn.h b/paddle/legacy/cuda/include/hl_cnn.h
deleted file mode 100644
index b790fa39fe8..00000000000
--- a/paddle/legacy/cuda/include/hl_cnn.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_H_
-#define HL_CNN_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Maximum pool forward with Mask output.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[out]  maskData    the location indices of select max data.
- */
-extern void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* maskData = NULL);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[out]  outData     output data.
- * @param[out]  outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  targetGrad  output grad.
- * @param[in]   outStride   stride between output data samples.
- *
- */
-extern void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride);
-
-/**
- * @brief   Averge pool forward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               bool excludeMode);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[out]  backGrad    output grad.
- * @param[in]   outStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                bool excludeMode);
-
-extern void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride);
-
-extern void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride);
-
-extern void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride);
-
-extern void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  int paddingD,
-                                  int paddingH,
-                                  int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride);
-
-/**
- * @brief   Bilinear interpolation forward.
- *
- * @param[in]   inData      input value.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[out]  outData     output value.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW);
-
-/**
- * @brief   Bilinear interpolation backward.
- *
- * @param[out]  inGrad      input gradient.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[in]   outGrad     output gradient.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW);
-
-/**
- * @brief   MaxOut forward.
- *
- * @param[in]   inData      input data.
- * @param[out]  outData     output data.
- * @param[out]  idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t groups);
-
-/**
- * @brief   MaxOut backward.
- *
- * @param[out]  inGrad      input grad data.
- * @param[in]   outGrad     output grad data.
- * @param[in]   idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t groups);
-
-/**
- * @brief   Upsample forward.
- * @param[in]   inputData   input data.
- * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
- * @param[out]  batchSize   the batch size of the input.
- * @param[in]   imgSizeH    image height.
- * @param[in]   imgSizeW    image width.
- * @param[in]   channels    the input channels.
- * @param[in]   outputH     the output height.
- * @param[in]   outputW     the output widht.
- * @param[out]  outputData  output data.
- */
-extern void hl_upsample_forward(real* inputData,
-                                real* maskData,
-                                size_t batchSize,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW,
-                                real* outputData);
-
-/**
- * @brief   Upsample backward.
- * @param[in]   outputGradData  the output grad data.
- * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
- * @param[out]  batchSize       the batch size of the input.
- * @param[in]   imgSizeH        image height.
- * @param[in]   imgSizeW        image width.
- * @param[in]   channels        the input channels.
- * @param[in]   outputH         the output height.
- * @param[in]   outputW         the output widht.
- * @param[out]  inputGradData   the input grad data.
- */
-extern void hl_upsample_backward(real* outputGradData,
-                                 real* maskData,
-                                 size_t batchSize,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 real* inputGradData);
-
-#endif  // HL_CNN_H_
diff --git a/paddle/legacy/cuda/include/hl_cpu_gru.cuh b/paddle/legacy/cuda/include/hl_cpu_gru.cuh
deleted file mode 100644
index ce1643932de..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_gru.cuh
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CPU_GRU_CUH_
-#define HL_CPU_GRU_CUH_
-
-#ifndef __NVCC__
-
-template<class OpResetOutput>
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                       real *gateValue,
-                                       real *resetOutputValue,
-                                       real *prevOutputValue,
-                                       int frameSize,
-                                       hl_activation_mode_t active_gate) {
-  real rValueUpdateGate;
-  real rValueResetGate;
-  real rValueResetOutput;
-  real rPrevOut = 0;
-  real *updateGate = gateValue;
-  real *resetGate = gateValue + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
-    }
-
-    opResetOutput(rValueUpdateGate,
-                  rValueResetGate,
-                  rPrevOut,
-                  rValueResetOutput,
-                  hppl::cpu::forward[active_gate]);
-
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    resetOutputValue[i] = rValueResetOutput;
-  }
-}
-
-template<class OpFinalOutput>
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                       real *gateValue,
-                                       real *prevOutputValue,
-                                       real *outputValue,
-                                       int frameSize,
-                                       hl_activation_mode_t active_node) {
-  real rValueUpdateGate;
-  real rValueFrameState;
-  real rPrevOut = 0;
-  real rOutput;
-  real *updateGate = gateValue;
-  real *frameState = gateValue + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
-    }
-
-    opFinalOutput(rValueUpdateGate,
-                  rValueFrameState,
-                  rPrevOut,
-                  rOutput,
-                  hppl::cpu::forward[active_node]);
-
-    frameState[i] = rValueFrameState;
-    outputValue[i] = rOutput;
-  }
-}
-
-template<class OpResetOutput>
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                     real *gateValue,
-                                     real *resetOutputValue,
-                                     real *prevOutputValue,
-                                     int frameSize,
-                                     hl_activation_mode_t active_gate) {
-#ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueResetGate;
-  __m256 rValueResetOutput;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 *updateGate = (__m256*)gateValue;
-  __m256 *resetGate = (__m256*)(gateValue + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256*)prevOutputValue)[i];
-    }
-
-    opResetOutput(rValueUpdateGate,
-                  rValueResetGate,
-                  rPrevOut,
-                  rValueResetOutput,
-                  hppl::avx::forward[active_gate]);
-
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    ((__m256*)resetOutputValue)[i] = rValueResetOutput;
-  }
-#endif
-}
-
-template<class OpFinalOutput>
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                     real *gateValue,
-                                     real *prevOutputValue,
-                                     real *outputValue,
-                                     int frameSize,
-                                     hl_activation_mode_t active_node) {
-#ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueFrameState;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 rOutput;
-  __m256 *updateGate = (__m256*)gateValue;
-  __m256 *frameState = (__m256*)(gateValue + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256*)prevOutputValue)[i];
-    }
-
-    opFinalOutput(rValueUpdateGate,
-                  rValueFrameState,
-                  rPrevOut,
-                  rOutput,
-                  hppl::avx::forward[active_node]);
-
-    frameState[i] = rValueFrameState;
-    ((__m256*)outputValue)[i] = rOutput;
-  }
-#endif
-}
-
-template<class OpResetOutput>
-inline void forward_reset_output(OpResetOutput opResetOutput,
-                                 hl_gru_value value,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_forward_reset_output(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, active_gate);
-    } else {
-      hl_naive_gru_forward_reset_output(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, active_gate);
-    }
-
-    value.gateValue += frameSize * 3;
-    value.resetOutputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-  }
-}
-
-template<class OpFinalOutput>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
-                                 hl_gru_value value,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_forward_final_output(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, active_node);
-    } else {
-      hl_naive_gru_forward_final_output(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, active_node);
-    }
-
-    value.gateValue += frameSize * 3;
-    value.outputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-  }
-}
-
-template<class OpStateGrad>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
-                                      real *gateValue,
-                                      real *gateGrad,
-                                      real *prevOutValue,
-                                      real *prevOutGrad,
-                                      real *outputGrad,
-                                      int frameSize,
-                                      hl_activation_mode_t active_node) {
-  real rUpdateGateValue;
-  real rUpdateGateGrad;
-  real rFrameStateValue;
-  real rFrameStateGrad;
-  real rOutGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real *updateGateValue = gateValue;
-  real *updateGateGrad = gateGrad;
-  real *frameStateValue = gateValue + frameSize * 2;
-  real *frameStateGrad = gateGrad + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad  = outputGrad[i];
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = prevOutGrad[i];
-    }
-
-    opStateGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rFrameStateValue,
-                rFrameStateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rOutGrad,
-                hppl::cpu::backward[active_node]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
-    }
-  }
-}
-
-template<class OpResetGrad>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad,
-                                      real *gateValue,
-                                      real *gateGrad,
-                                      real *prevOutValue,
-                                      real *prevOutGrad,
-                                      real *resetOutputGrad,
-                                      int frameSize,
-                                      hl_activation_mode_t active_gate) {
-  real rUpdateGateValue;
-  real rUpdateGateGrad;
-  real rResetGateValue;
-  real rResetGateGrad;
-  real rResetOutputGrad = 0;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real *updateGateValue = gateValue;
-  real *updateGateGrad = gateGrad;
-  real *resetGateValue = gateValue + frameSize;
-  real *resetGateGrad = gateGrad + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = resetOutputGrad[i];
-    }
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = prevOutGrad[i];
-    }
-
-    opResetGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rResetGateValue,
-                rResetGateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rResetOutputGrad,
-                hppl::cpu::backward[active_gate]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
-    }
-  }
-}
-
-template<class OpStateGrad>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad,
-                                    real *gateValue,
-                                    real *gateGrad,
-                                    real *prevOutValue,
-                                    real *prevOutGrad,
-                                    real *outputGrad,
-                                    int frameSize,
-                                    hl_activation_mode_t active_node) {
-#ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rFrameStateValue;
-  __m256 rFrameStateGrad;
-  __m256 rOutGrad;
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad  = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256*)gateValue;
-  __m256 *updateGateGrad = (__m256*)gateGrad;
-  __m256 *frameStateValue = (__m256*)(gateValue + frameSize * 2);
-  __m256 *frameStateGrad = (__m256*)(gateGrad + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad  = ((__m256*)outputGrad)[i];
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256*)prevOutValue)[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = ((__m256*)prevOutGrad)[i];
-    }
-
-    opStateGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rFrameStateValue,
-                rFrameStateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rOutGrad,
-                hppl::avx::backward[active_node]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      ((__m256*)prevOutGrad)[i] = rPrevOutGrad;
-    }
-  }
-#endif
-}
-
-template<class OpResetGrad>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad,
-                                    real *gateValue,
-                                    real *gateGrad,
-                                    real *prevOutValue,
-                                    real *prevOutGrad,
-                                    real *resetOutputGrad,
-                                    int frameSize,
-                                    hl_activation_mode_t active_gate) {
-#ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rResetGateValue;
-  __m256 rResetGateGrad;
-  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad  = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256*)gateValue;
-  __m256 *updateGateGrad = (__m256*)gateGrad;
-  __m256 *resetGateValue = (__m256*)(gateValue + frameSize);
-  __m256 *resetGateGrad = (__m256*)(gateGrad + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = ((__m256*)resetOutputGrad)[i];
-    }
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256*)prevOutValue)[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = ((__m256*)prevOutGrad)[i];
-    }
-
-    opResetGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rResetGateValue,
-                rResetGateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rResetOutputGrad,
-                hppl::avx::backward[active_gate]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      ((__m256*)prevOutGrad)[i] = rPrevOutGrad;
-    }
-  }
-#endif
-}
-
-template<class OpStateGrad>
-inline void backward_state_grad(OpStateGrad opStateGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_backward_state_grad(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, active_node);
-    } else {
-      hl_naive_gru_backward_state_grad(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, active_node);
-    }
-
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 3;
-    grad.outputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
-    }
-  }
-}
-
-template<class OpResetGrad>
-inline void backward_reset_grad(OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_backward_reset_grad(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, active_gate);
-    } else {
-      hl_naive_gru_backward_reset_grad(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, active_gate);
-    }
-
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 3;
-    grad.resetOutputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
-    }
-  }
-}
-
-#endif
-
-#endif  // HL_CPU_GRU_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_lstm.cuh b/paddle/legacy/cuda/include/hl_cpu_lstm.cuh
deleted file mode 100644
index 58a97d1230d..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_lstm.cuh
+++ /dev/null
@@ -1,372 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CPU_LSTM_CUH_
-#define HL_CPU_LSTM_CUH_
-
-#ifndef __NVCC__
-
-// using namespace hppl;
-
-template<class Op>
-void hl_naive_lstm_forward_one_sequence(Op op,
-                                        hl_lstm_value value,
-                                        int frameSize,
-                                        hl_activation_mode_t active_node,
-                                        hl_activation_mode_t active_gate,
-                                        hl_activation_mode_t active_state) {
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rCheckI;
-  real rCheckF;
-  real rCheckO;
-  real rState;
-  real rPrevState = 0;
-  real rStateAtv;
-  real rOut;
-
-  real *valueIn = value.gateValue;
-  real *valueIg = value.gateValue + frameSize;
-  real *valueFg = value.gateValue + frameSize * 2;
-  real *valueOg = value.gateValue + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rPrevState,
-       rState,
-       rStateAtv,
-       rOut,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       hppl::cpu::forward[active_node],
-       hppl::cpu::forward[active_gate],
-       hppl::cpu::forward[active_state]);
-
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    value.stateValue[i] = rState;
-    value.stateActiveValue[i] = rStateAtv;
-    value.outputValue[i] = rOut;
-  }
-}
-
-template<class Op>
-void hl_naive_lstm_backward_one_sequence(Op op,
-                                         hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize,
-                                         hl_activation_mode_t active_node,
-                                         hl_activation_mode_t active_gate,
-                                         hl_activation_mode_t active_state) {
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rGradIn;
-  real rGradIg;
-  real rGradFg;
-  real rGradOg;
-  real rPrevState = 0;
-  real rPrevStateGrad;
-  real rState;
-  real rStateGrad;
-  real rStateAtv;
-  real rOutputGrad;
-  real rCheckI;
-  real rCheckF;
-  real rCheckO;
-  real rCheckIGrad;
-  real rCheckFGrad;
-  real rCheckOGrad;
-
-  real *valueIn = value.gateValue;
-  real *valueIg = value.gateValue + frameSize;
-  real *valueFg = value.gateValue + frameSize * 2;
-  real *valueOg = value.gateValue + frameSize * 3;
-  real *gradIn = grad.gateGrad;
-  real *gradIg = grad.gateGrad + frameSize;
-  real *gradFg = grad.gateGrad + frameSize * 2;
-  real *gradOg = grad.gateGrad + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-    rState = value.stateValue[i];
-    rStateAtv = value.stateActiveValue[i];
-    rOutputGrad = grad.outputGrad[i];
-    rStateGrad = grad.stateGrad[i];
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rGradIn,
-       rGradIg,
-       rGradFg,
-       rGradOg,
-       rPrevState,
-       rPrevStateGrad,
-       rState,
-       rStateGrad,
-       rStateAtv,
-       rOutputGrad,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       rCheckIGrad,
-       rCheckFGrad,
-       rCheckOGrad,
-       hppl::cpu::backward[active_node],
-       hppl::cpu::backward[active_gate],
-       hppl::cpu::backward[active_state]);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    grad.stateGrad[i] = rStateGrad;
-
-    if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
-  }
-}
-
-template<class Op>
-void hl_avx_lstm_forward_one_sequence(Op op,
-                                      hl_lstm_value value,
-                                      int frameSize,
-                                      hl_activation_mode_t active_node,
-                                      hl_activation_mode_t active_gate,
-                                      hl_activation_mode_t active_state) {
-#ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rState;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rStateAtv;
-  __m256 rOut;
-
-  __m256 *valueIn = (__m256*)value.gateValue;
-  __m256 *valueIg = (__m256*)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256*)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256*)(value.gateValue + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256*)value.checkIg)[i];
-    rCheckF = ((__m256*)value.checkFg)[i];
-    rCheckO = ((__m256*)value.checkOg)[i];
-
-    if (value.prevStateValue) {
-      rPrevState = ((__m256*)value.prevStateValue)[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rPrevState,
-       rState,
-       rStateAtv,
-       rOut,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       hppl::avx::forward[active_node],
-       hppl::avx::forward[active_gate],
-       hppl::avx::forward[active_state]);
-
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    ((__m256*)value.stateValue)[i] = rState;
-    ((__m256*)value.stateActiveValue)[i] = rStateAtv;
-    ((__m256*)value.outputValue)[i] = rOut;
-  }
-#endif
-}
-
-template<class Op>
-void hl_avx_lstm_backward_one_sequence(Op op,
-                                       hl_lstm_value value,
-                                       hl_lstm_grad grad,
-                                       int frameSize,
-                                       hl_activation_mode_t active_node,
-                                       hl_activation_mode_t active_gate,
-                                       hl_activation_mode_t active_state) {
-#ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rGradIn;
-  __m256 rGradIg;
-  __m256 rGradFg;
-  __m256 rGradOg;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rPrevStateGrad;
-  __m256 rStateGrad;
-  __m256 rState;
-  __m256 rStateAtv;
-  __m256 rOutputGrad;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rCheckIGrad;
-  __m256 rCheckFGrad;
-  __m256 rCheckOGrad;
-
-  __m256 *valueIn = (__m256*)value.gateValue;
-  __m256 *valueIg = (__m256*)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256*)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256*)(value.gateValue + frameSize * 3);
-  __m256 *gradIn = (__m256*)grad.gateGrad;
-  __m256 *gradIg = (__m256*)(grad.gateGrad + frameSize);
-  __m256 *gradFg = (__m256*)(grad.gateGrad + frameSize * 2);
-  __m256 *gradOg = (__m256*)(grad.gateGrad + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256*)value.checkIg)[i];
-    rCheckF = ((__m256*)value.checkFg)[i];
-    rCheckO = ((__m256*)value.checkOg)[i];
-    rState = ((__m256*)value.stateValue)[i];
-    rStateAtv = ((__m256*)value.stateActiveValue)[i];
-    rOutputGrad = ((__m256*)grad.outputGrad)[i];
-    rStateGrad = ((__m256*)grad.stateGrad)[i];
-    if (value.prevStateValue) {
-      rPrevState = ((__m256*)value.prevStateValue)[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rGradIn,
-       rGradIg,
-       rGradFg,
-       rGradOg,
-       rPrevState,
-       rPrevStateGrad,
-       rState,
-       rStateGrad,
-       rStateAtv,
-       rOutputGrad,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       rCheckIGrad,
-       rCheckFGrad,
-       rCheckOGrad,
-       hppl::avx::backward[active_node],
-       hppl::avx::backward[active_gate],
-       hppl::avx::backward[active_state]);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    ((__m256*)grad.stateGrad)[i] = rStateGrad;
-
-    if (grad.prevStateGrad) ((__m256*)grad.prevStateGrad)[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) ((__m256*)grad.checkIgGrad)[i] += rCheckIGrad;
-      if (grad.checkFgGrad) ((__m256*)grad.checkFgGrad)[i] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) ((__m256*)grad.checkOgGrad)[i] += rCheckOGrad;
-  }
-#endif
-}
-
-template<class Op>
-void hl_cpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-    hl_avx_lstm_forward_one_sequence(op, value, frameSize,
-        active_node, active_gate, active_state);
-  } else {
-    hl_naive_lstm_forward_one_sequence(op, value, frameSize,
-        active_node, active_gate, active_state);
-  }
-}
-
-template<class Op>
-void hl_cpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-    hl_avx_lstm_backward_one_sequence(op, value, grad, frameSize,
-        active_node, active_gate, active_state);
-  } else {
-    hl_naive_lstm_backward_one_sequence(op, value, grad, frameSize,
-        active_node, active_gate, active_state);
-  }
-}
-
-#endif
-
-#endif /* HL_CPU_LSTM_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
deleted file mode 100644
index 4db9bb74e0a..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_MATRIX_KERNEL_CUH_
-#define HL_CPU_MATRIX_KERNEL_CUH_
-
-#include <stdio.h>
-#include "hl_base.h"
-
-#ifndef __CUDA_ARCH__
-#include "hl_cpu_matrix_kernel_detail.cuh"
-#endif
-
-/**
- * @brief   cpu element wise unary operator.
- */
-template <class T, class Op>
-void hl_cpu_apply_unary_op(Op op, T* A_h, int dimM, int dimN, int lda) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      op.cpuOperator(A_h[i*lda + j]);
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise binary operator.
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_cpu_apply_binary_op(Op op,
-                            T* A_h,
-                            T* B_h,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      if (BAsRowVector == 0 && BAsColVector == 0) {
-        op.cpuOperator(A_h[i * lda + j], B_h[i * ldb + j]);
-      } else if (BAsRowVector == 1 && BAsColVector == 0) {
-        op.cpuOperator(A_h[i * lda + j], B_h[j]);
-      } else if (BAsRowVector == 0 && BAsColVector == 1) {
-        op.cpuOperator(A_h[i * lda + j], B_h[i * ldb]);
-      } else {
-        op.cpuOperator(A_h[i * lda + j], B_h[0]);
-      }
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise ternary operator.
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_cpu_apply_ternary_op(Op op,
-                             T* A_h,
-                             T* B_h,
-                             T* C_h,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      if (CAsRowVector == 0 && CAsColVector == 0) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[i*ldc + j]);
-      } else if (CAsRowVector == 1 && CAsColVector == 0) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[j]);
-      } else if (CAsRowVector == 0 && CAsColVector == 1) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[i*ldc]);
-      } else {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[0]);
-      }
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise quaternary operator.
- */
-template <class T, class Op>
-void hl_cpu_apply_quaternary_op(Op op,
-                                T* A_h,
-                                T* B_h,
-                                T* C_h,
-                                T* D_h,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      op.cpuOperator(A_h[i*lda + j],
-                     B_h[i*ldb + j],
-                     C_h[i*ldc + j],
-                     D_h[i*ldd + j]);
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))) {
-      hl_sse_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-    } else {
-      hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(B) && hl_check_align(ldb*sizeof(real))) {
-      hl_sse_matrix_row_op(
-        agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-    } else {
-      hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(dst)) {
-      hl_sse_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-    } else {
-      hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(B) && hl_check_align(ldb*sizeof(real))
-      && hl_check_align(dst)) {
-      hl_sse_matrix_column_op(
-        agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-    } else {
-      hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-    }
-  }
-#endif
-}
-
-#endif /* HL_CPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
deleted file mode 100644
index 54a749b9907..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+++ /dev/null
@@ -1,310 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
-#define HL_MATRIX_KERNEL_DETAIL_CUH_
-
-#include "hl_matrix_type.cuh"
-
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-        mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-
-#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_scalar.cuh b/paddle/legacy/cuda/include/hl_cpu_scalar.cuh
deleted file mode 100644
index 939302e9715..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_scalar.cuh
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SCALAR_CUH_
-#define HL_CPU_SCALAR_CUH_
-
-#define VECTOR_SIMD false
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-/* size of float */
-#define VECTOR_SIZE 4
-#else
-/* size of double */
-#define VECTOR_SIZE 8
-#endif
-
-typedef real vecType;
-
-/* Consider a real as a vector */
-#define VECTOR_LEN  1
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  return mm;
-}
-
-INLINE real hl_vec_set(const real r) {
-  return r;
-}
-
-INLINE real hl_vec_classification_error(const real a,
-                                        const real b,
-                                        const real p,
-                                        const real r) {
-  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
-}
-
-#endif  // HL_CPU_SCALAR_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
deleted file mode 100644
index e54e0f4646b..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SIMD_NEON_CUH_
-#define HL_CPU_SIMD_NEON_CUH_
-
-#include <arm_neon.h>
-
-#define VECTOR_SIMD true
-#define VECTOR_SIZE 16
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-typedef float32x4_t vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  4
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-
-  return vgetq_lane_f32(ret, 0);
-}
-
-inline float32x4_t hl_vec_set(const real f) {
-  return vdupq_n_f32(f);
-}
-
-inline float32x4_t hl_vec_classification_error(const float32x4_t a,
-                                               const float32x4_t b,
-                                               const float32x4_t p,
-                                               const float32x4_t r) {
-  uint32x4_t tmp1 = vcgtq_f32(a, p);
-  uint32x4_t tmp2 = vcgtq_f32(b, p);
-  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
-  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
-}
-
-#else
-
-#ifdef __aarch64__
-typedef float64x2_t vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  2
-#define VECTOR_SET  vdupq_n_f64
-
-#error To be implemented
-#else
-#error NEON instructions does not support double precision
-#endif  // __aarch64__
-
-#endif
-
-#endif  // HL_CPU_SIMD_NEON_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
deleted file mode 100644
index 20c37d4dd31..00000000000
--- a/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SIMD_SSE_CUH_
-#define HL_CPU_SIMD_SSE_CUH_
-
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#define VECTOR_SIMD true
-#define VECTOR_SIZE 16
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-typedef __m128  vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  4
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
-  __m128 tmp1 = agg.vecOp(lo, hi);
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
-  __m128 ret = agg.vecOp(tmp1, tmp2);
-
-  return _mm_cvtss_f32(ret);
-}
-
-inline __m128 hl_vec_set(const real f) {
-  return _mm_set_ps1(f);
-}
-
-inline __m128 hl_vec_classification_error(const __m128 a,
-                                          const __m128 b,
-                                          const __m128 p,
-                                          const __m128 r) {
-  __m128 tmp1 = _mm_cmpgt_ps(a, p);
-  __m128 tmp2 = _mm_cmpgt_ps(b, p);
-  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
-  return _mm_and_ps(tmp3, r);
-}
-
-#else
-
-typedef __m128d vecType;
-
-/* number of double in vector */
-#define VECTOR_LEN  2
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
-  __m128d ret = agg.vecOp(lo, hi);
-
-  return _mm_cvtsd_f64(ret);
-}
-
-inline __m128d hl_vec_set(const real d) {
-#if defined(__APPLE__) || defined(__OSX__)
-  return _mm_set1_pd(d);
-#else
-  return _mm_set_pd1(d);
-#endif
-}
-
-inline __m128d hl_vec_classification_error(const __m128d a,
-                                           const __m128d b,
-                                           const __m128d p,
-                                           const __m128d r) {
-  __m128d tmp1 = _mm_cmpgt_pd(a, p);
-  __m128d tmp2 = _mm_cmpgt_pd(b, p);
-  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
-  return _mm_and_pd(tmp3, r);
-}
-
-#endif
-
-#endif  // HL_CPU_SIMD_SSE_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cuda.h b/paddle/legacy/cuda/include/hl_cuda.h
deleted file mode 100644
index 70efcccb818..00000000000
--- a/paddle/legacy/cuda/include/hl_cuda.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_H_
-#define HL_CUDA_H_
-
-#include <string>
-#include "hl_base.h"
-
-/**
- * @brief   HPPL event.
- */
-typedef struct _hl_event_st *hl_event_t;
-
-/**
- * @brief return cuda runtime api version.
- */
-extern int hl_get_cuda_lib_version();
-
-/**
- * @brief   HPPL strat(Initialize all GPU).
- */
-extern void hl_start();
-
-/**
- * @brief   HPPL start(Initialize the specific GPU).
- *
- * @param[in]   device  device id(0, 1......).
- *                      if device is NULL, will start all GPU.
- * @param[in]   number  number of devices.
- */
-extern void hl_specify_devices_start(int *device, int number);
-
-/**
- * @brief   Queries if a device may directly access a peer device's memory.
- *
- * @param[in]   device      Device from which allocations on peerDevice are
- *                          to be directly accessed.
- * @param[in]   peerDevice  Device on which the allocations to be directly
- *                          accessed by device reside.
- *
- * @return  Returns true if device is capable of directly accessing memory
- *          from peerDevice and false otherwise.
- */
-bool hl_device_can_access_peer(int device, int peerDevice);
-
-/**
- * @brief   Enables direct access to memory allocations on a peer device.
- *
- * @param[in]   peerDevice  Peer device to enable direct access to from the
- *                          current device
- */
-void hl_device_enable_peer_access(int peerDevice);
-
-/**
- * @brief   Init a work thread.
- *
- * @param[in]   device  device id.
- */
-extern void hl_init(int device);
-
-/**
- * @brief   Finish a work thread.
- */
-extern void hl_fini();
-
-/**
- * @brief   Set synchronous/asynchronous flag.
- *
- * @param[in]   flag    true(default), set synchronous flag.
- *                      false, set asynchronous flag.
- *
- *
- * @note    This setting is only valid for the current worker thread.
- */
-extern void hl_set_sync_flag(bool flag);
-
-/**
- * @brief   Get synchronous/asynchronous flag.
- *
- * @return  Synchronous call true.
- *          Asynchronous call false.
- *
- */
-extern bool hl_get_sync_flag();
-
-/**
- * @brief   Returns the number of compute-capable devices.
- *
- */
-extern int hl_get_device_count();
-
-/**
- * @brief   Set device to be used.
- *
- * @param[in]   device  device id.
- *
- */
-extern void hl_set_device(int device);
-
-/**
- * @brief   Returns which device is currently being used.
- *
- * @return  device  device id.
- *
- */
-extern int hl_get_device();
-
-/**
- * @brief   Allocate device memory.
- *
- * @param[in]   size     size in bytes to copy.
- *
- * @return      dest_d   pointer to device memory.
- */
-extern void *hl_malloc_device(size_t size);
-
-/**
- * @brief   Free device memory.
- *
- * @param[in]   dest_d  pointer to device memory.
- *
- */
-extern void hl_free_mem_device(void *dest_d);
-
-/**
- * @brief   Allocate host page-lock memory.
- *
- * @param[in]   size     size in bytes to copy.
- *
- * @return      dest_h   pointer to host memory.
- */
-extern void *hl_malloc_host(size_t size);
-
-/**
- * @brief   Free host page-lock memory.
- *
- * @param[in]   dest_h  pointer to host memory.
- *
- */
-extern void hl_free_mem_host(void *dest_h);
-
-/**
- * @brief   Copy data.
- *
- * @param[in]   dst     dst memory address(host or device).
- * @param[in]   src     src memory address(host or device).
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy(void *dst, void *src, size_t size);
-
-/**
- * @brief   Set device memory to a value.
- *
- * @param[in]   dest_d  pointer to device memory.
- * @param[in]   value   value to set for each byte of specified memory.
- * @param[in]   size    size in bytes to set.
- *
- */
-extern void hl_memset_device(void *dest_d, int value, size_t size);
-
-/**
- * @brief   Copy host memory to device memory.
- *
- * @param[in]   dest_d  dst memory address.
- * @param[in]   src_h   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size);
-
-/**
- * @brief   Copy device memory to host memory.
- *
- * @param[in]   dest_h  dst memory address.
- * @param[in]   src_d   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size);
-
-/**
- * @brief   Copy device memory to device memory.
- *
- * @param[in]   dest_d  dst memory address.
- * @param[in]   src_d   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size);
-
-/**
- * @brief   Generate uniformly distributed floats (0, 1.0].
- *
- * @param[in]   dest_d  pointer to device memory to store results.
- * @param[in]   num     number of floats to generate.
- *
- */
-extern void hl_rand(real *dest_d, size_t num);
-
-/**
- * @brief   Set the seed value of the random number generator.
- *
- * @param[in]   seed    seed value.
- */
-extern void hl_srand(unsigned int seed);
-
-/**
- * @brief   Copy data.
- *
- * @param[in]   dst     dst memory address(host or device).
- * @param[in]   src     src memory address(host or device).
- * @param[in]   size    size in bytes to copy.
- * @param[in]   stream  stream id.
- */
-extern void hl_memcpy_async(void *dst,
-                            void *src,
-                            size_t size,
-                            hl_stream_t stream);
-
-/**
- * @brief   Waits for stream tasks to complete.
- *
- * @param[in]   stream  stream id.
- */
-extern void hl_stream_synchronize(hl_stream_t stream);
-
-/**
- * @brief   Creates an event object.
- *
- * @param[out]   event  New event.
- */
-extern void hl_create_event(hl_event_t *event);
-
-/**
- * @brief   Destroys an event object.
- *
- * @param[in]   event   Event to destroy.
- */
-extern void hl_destroy_event(hl_event_t event);
-
-/**
- * @brief   Computes the elapsed time between events.
- *
- * @param[in]   start  Starting event.
- * @param[in]   end    Ending event.
- *
- * @return      time   Time between start and end in ms.
- */
-extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
-
-/**
- * @brief   Records an event.
- *
- * @param[in]   stream   Stream in which to insert event.
- * @param[in]   event    Event waiting to be recorded as completed.
- *
- */
-extern void hl_stream_record_event(hl_stream_t stream, hl_event_t event);
-
-/**
- * @brief   Make a compute stream wait on an event.
- *
- * @param[in]   stream   Stream in which to insert event.
- * @param[in]   event    Event to wait on.
- *
- */
-extern void hl_stream_wait_event(hl_stream_t stream, hl_event_t event);
-
-/**
- * @brief   Wait for an event to complete.
- *
- * @param[in]   event       event to wait for.
- *
- */
-extern void hl_event_synchronize(hl_event_t event);
-
-/**
- * @brief   Sets block flags to be used for device executions.
- *
- * @note    This interface needs to be called before hl_start.
- */
-extern void hl_set_device_flags_block();
-
-/**
- * @brief   Returns the last error string from a cuda runtime call.
- */
-extern const char *hl_get_device_error_string();
-
-/**
- * @brief     Returns the last error string from a cuda runtime call.
- *
- * @param[in] err  error number.
- *
- * @see       hl_get_device_last_error()
- */
-extern const char *hl_get_device_error_string(size_t err);
-
-/**
- * @brief   Returns the last error number.
- *
- * @return  error number.
- *
- * @see     hl_get_device_error_string()
- */
-extern int hl_get_device_last_error();
-
-/**
- * @brief   check cuda event is ready
- *
- * @param[in]  event        cuda event to query.
- *
- * @return     true    cuda event is ready.
- *             false   cuda event is not ready.
- */
-extern bool hl_cuda_event_is_ready(hl_event_t event);
-
-/**
- * @brief   hppl device synchronization.
- */
-extern void hl_device_synchronize();
-
-/**
- * @brief   gpu profiler start
- */
-extern void hl_profiler_start();
-
-/**
- * @brief   gpu profiler stop
- */
-extern void hl_profiler_end();
-
-#endif  // HL_CUDA_H_
diff --git a/paddle/legacy/cuda/include/hl_cuda.ph b/paddle/legacy/cuda/include/hl_cuda.ph
deleted file mode 100644
index 7c4465e51ff..00000000000
--- a/paddle/legacy/cuda/include/hl_cuda.ph
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CUDA_PH_
-#define HL_CUDA_PH_
-
-#include <stdio.h>
-#include <pthread.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
-#include "hl_base.h"
-
-/**
- * @brief   hppl event.
- * @param   cuda event.
- */
-struct _hl_event_st {
-    cudaEvent_t     cu_event;       /* cuda event */
-};
-
-/**
- * @brief   global device resources.
- *
- * @param   *stream         device global stream.
- * @param   handle          devcie cublas handle.
- * @param   gen             device curand generator.
- * @param   cudnn_handle    cudnn handle.
- * @param   *gen_mutex      gen lock.
- */
-typedef struct {
-    cudaStream_t        *stream;
-    cublasHandle_t      handle;
-    curandGenerator_t   gen;
-    cudnnHandle_t       cudnn_handle;
-    pthread_mutex_t     *gen_mutex;
-}_global_device_resources, *global_device_resources;
-
-/*
- * @brief   thread device resources.
- *
- * @param   *stream         device thread stream.
- * @param   *gpu_mem        device memory.
- * @param   *cpu_mem        cpu memory.
- * @param    mem_event      device memory lock.
- */
-typedef struct {
-    cudaStream_t   *stream;
-    real           *gpu_mem;
-    real           *cpu_mem;
-    cudaEvent_t    mem_event;
-}_thread_device_resources, *thread_device_resources;
-
-/*
- * @brief   hppl device properties.
- *
- * @param   device            device id.
- * @param   device_type       0.Nvidia, 1.AMD, 2.Intel.
- * @param   device_name[256]  device name.
- * @param   device_mem        total global memory.
- * @param   major             device compute capability.
- * @param   minor             device compute capability.
- * @param   is_local          local device or not.
- * @param   device_resources  device resources.
- */
-typedef struct {
-    int device;
-    int device_type;
-    char device_name[256];
-    size_t device_mem;
-    int major;
-    int minor;
-    bool is_local;
-    global_device_resources device_resources;
-} _hl_device_prop, *hl_device_prop;
-
-/**
- * @brief   thread device resource allocation.
- *
- * create cuda stream and cuda event, allocate gpu
- * memory and host page-lock memory for threads.
- *
- * @param[in]   device      device number.
- * @param[out]  device_res  device properties.
- */
-extern void hl_create_thread_resources(int device,
-                                       thread_device_resources device_res);
-
-/**
- * @brief   global device resource allocation.
- *
- * create cuda stream, initialize cublas, curand and cudnn.
- *
- * @param[out]   device_prop  device properties.
- */
-extern void hl_create_global_resources(hl_device_prop device_prop);
-
-#endif  /* HL_CUDA_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_cuda_cublas.h b/paddle/legacy/cuda/include/hl_cuda_cublas.h
deleted file mode 100644
index 3959f81677b..00000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cublas.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUBLAS_H_
-#define HL_CUDA_CUBLAS_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Matrix transpose: C_d = T(A_d)
- *
- * @param[in]   A_d     input matrix (dimM x dimN).
- * @param[out]  C_d     output matrix (dimN x dimM).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- * @param[in]   lda     the first dimension of A_d.
- * @param[in]   ldc     the first dimension of C_d.
- *
- */
-extern void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
-
-/*
- * @brief Matrix transpose, while lda = dimN, ldc = dimM.
- *
- * @param[in]   A_d     input matrix (dimM x dimN).
- * @param[out]  C_d     output matrix (dimN x dimM).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
-
-/*
- * @brief Matrix inverse
- *
- * @param[in]   A_d    input matrix (dimN x dimN).
- * @param[out]  C_d    output matrix (dimN x dimN).
- * @param[in]   dimN   matrix height = matrix width
- * @param[in]   lda    the first dimension of A_d
- * @param[in]   ldc    the first dimension of C_d
- *
- */
-extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     input.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     input.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     output.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- * @param[in]   lda     the first dimension of A_d.
- * @param[in]   ldb     the first dimension of B_d.
- * @param[in]   ldc     the first dimension of C_d.
- *
- */
-extern void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int ldb,
-                          int ldc);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     input.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     input.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     output.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *
- */
-extern void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta);
-
-/**
- * @brief   This function performs the matrix-vector multiplication.
- *          C_d = alpha*op(A_d)*B_d + beta*C_d
- *
- * @param[in]     A_d    matrix.
- * @param[in]     trans  operation op(A) that is non-or transpose.
- * @param[in]     B_d    vector with dimN(dimM) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in,out] C_d    vector with dimM(dimN) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in]     dimM   number of rows of matrix A_d.
- * @param[in]     dimN   number of columns of matrix A_d.
- * @param[in]     alpha  scalar used for multiplication.
- * @param[in]     beta   scalar used for multiplication.
- * @param[in]     lda    the first dimension of A_d.
- * @param[in]     incb   increase B_d size for compaction.
- * @param[in]     incc   increase C_d size for compaction.
- *
- */
-
-extern void hl_matrix_mul_vector(real *A_d,
-                                 hl_trans_op_t trans,
-                                 real *B_d,
-                                 real *C_d,
-                                 int dimM,
-                                 int dimN,
-                                 real alpha,
-                                 real beta,
-                                 int lda,
-                                 int incb,
-                                 int incc);
-
-/**
- * @brief   This function performs the matrix-vector multiplication.
- *          C_d = alpha*op(A_d)*B_d + beta*C_d
- *
- * @param[in]     A_d    matrix.
- * @param[in]     trans  operation op(A) that is non-or transpose.
- * @param[in]     B_d    vector with dimN(dimM) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in,out] C_d    vector with dimM(dimN) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in]     dimM   number of rows of matrix A_d.
- * @param[in]     dimN   number of columns of matrix A_d.
- * @param[in]     alpha  scalar used for multiplication.
- * @param[in]     beta   scalar used for multiplication.
- *
- */
-extern void hl_matrix_mul_vector(real *A_d,
-                                 hl_trans_op_t trans,
-                                 real *B_d,
-                                 real *C_d,
-                                 int dimM,
-                                 int dimN,
-                                 real alpha,
-                                 real beta);
-
-#endif /* HL_CUDA_CUBLAS_H_ */
diff --git a/paddle/legacy/cuda/include/hl_cuda_cudnn.h b/paddle/legacy/cuda/include/hl_cuda_cudnn.h
deleted file mode 100644
index 4664e4144a8..00000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cudnn.h
+++ /dev/null
@@ -1,516 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_H_
-#define HL_CUDA_CUDNN_H_
-
-#include "hl_base.h"
-
-/*
- *  hppl pooling mode
- */
-typedef enum {
-  HL_POOLING_MAX = 0,
-  // average does not include padded values
-  HL_POOLING_AVERAGE = 1,
-  // average includes padded values
-  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
-  HL_POOLING_END
-} hl_pooling_mode_t;
-
-/**
- * @brief return cudnn lib version
- */
-
-extern int hl_get_cudnn_lib_version();
-
-/**
- * @brief   hppl image descriptor.
- */
-typedef struct _hl_tensor_descriptor* hl_tensor_descriptor;
-
-/**
- * @brief   hppl pooling descriptor.
- */
-typedef struct _hl_pooling_descriptor* hl_pooling_descriptor;
-
-/**
- * @brief   hppl filter descriptor.
- */
-typedef struct _hl_filter_descriptor* hl_filter_descriptor;
-
-/**
- * @brief   hppl filter descriptor.
- */
-typedef struct _hl_convolution_descriptor* hl_convolution_descriptor;
-
-/**
- * @brief   create image descriptor.
- *
- * @param[out]   image_desc     image descriptor.
- *
- */
-extern void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc);
-
-/**
- * @brief   reshape image descriptor.
- *
- * @param[in,out]   image_desc    image descriptor.
- * @param[in]       batch_size    input batch size.
- * @param[in]       feature_maps  image feature maps.
- * @param[in]       height        image height.
- * @param[in]       width         image width.
- */
-extern void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width);
-
-/**
- * @brief   reshape image descriptor.
- *
- * @param[in,out]   image_desc    image descriptor.
- * @param[in]       batch_size    input batch size.
- * @param[in]       feature_maps  image feature maps.
- * @param[in]       height        image height.
- * @param[in]       width         image width.
- * @param[in]       nStride       stride between two consecutive images.
- * @param[in]       cStride       stride between two consecutive feature maps.
- * @param[in]       hStride       stride between two consecutive rows.
- * @param[in]       wStride       stride between two consecutive columns.
- *
- */
-extern void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width,
-                              int nStride,
-                              int cStride,
-                              int hStride,
-                              int wStride);
-
-/**
- * @brief   destroy image descriptor.
- *
- * @param[in]   image_desc  hppl image descriptor.
- */
-extern void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc);
-
-/**
- * @brief   create pooling descriptor.
- *
- * @param[out]  pooling_desc    pooling descriptor.
- * @param[in]   mode            pooling mode.
- * @param[in]   height          height of the pooling window.
- * @param[in]   width           width of the pooling window.
- * @param[in]   height_padding  padding height.
- * @param[in]   width_padding   padding width.
- * @param[in]   stride_height   pooling vertical stride.
- * @param[in]   stride_width    pooling horizontal stride.
- */
-extern void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                         hl_pooling_mode_t mode,
-                                         int height,
-                                         int width,
-                                         int height_padding,
-                                         int width_padding,
-                                         int stride_height,
-                                         int stride_width);
-
-/**
- * @brief   destroy pooling descriptor.
- *
- * @param[in]   pooling_desc  hppl pooling descriptor.
- *
- */
-extern void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc);
-
-/**
- * @brief   pooling forward(calculate output image).
- *
- * @param[in]   input           input image descriptor.
- * @param[in]   input_image     input image data.
- * @param[in]   output          output image descriptor.
- * @param[out]  output_image    output image data.
- * @param[in]   pooling         pooling descriptor.
- *
- */
-extern void hl_pooling_forward(hl_tensor_descriptor input,
-                               real* input_image,
-                               hl_tensor_descriptor output,
-                               real* output_image,
-                               hl_pooling_descriptor pooling);
-
-/**
- * @brief   pooling backward(calculate input image gradient).
- *
- * @param[in]   input               input image descriptor.
- * @param[in]   input_image         input image data.
- * @param[in]   input_image_grad    input image gradient data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_image        output image data.
- * @param[out]  output_image_grad   output image gradient data.
- * @param[in]   pooling             pooling descriptor.
- *
- */
-extern void hl_pooling_backward(hl_tensor_descriptor input,
-                                real* input_image,
-                                real* input_image_grad,
-                                hl_tensor_descriptor output,
-                                real* output_image,
-                                real* output_image_grad,
-                                hl_pooling_descriptor pooling);
-
-/**
- * @brief   create filter descriptor.
- *
- * @param[out]  filter                  filter descriptor.
- * @param[in]   input_feature_maps      input image feature maps.
- * @param[in]   output_feature_maps     output image feature maps.
- * @param[in]   height                  filter height.
- * @param[in]   width                   filter width.
- *
- */
-extern void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                        int input_feature_maps,
-                                        int output_feature_maps,
-                                        int height,
-                                        int width);
-
-/**
- * @brief    convolution workspace configuration
- *
- * @param[in]    input                image descriptor
- * @param[in]    output               image descriptor
- * @param[in]    filter               filter descriptor
- * @param[in]    conv                 convolution descriptor
- * @param[out]   convFwdAlgo          forward algorithm
- * @param[out]   fwdLimitBytes        forward workspace size
- * @param[out]   convBwdDataAlgo      backward data algorithm
- * @param[out]   bwdDataLimitBytes    backward data workspace size
- * @param[out]   convBwdFilterAlgo    backward filter algorithm
- * @param[out]   bwdFilterLimitBytes  backward filter workspace size
- *
- */
-extern void hl_conv_workspace(hl_tensor_descriptor input,
-                              hl_tensor_descriptor output,
-                              hl_filter_descriptor filter,
-                              hl_convolution_descriptor conv,
-                              int* convFwdAlgo,
-                              size_t* fwdLimitBytes,
-                              int* convBwdDataAlgo,
-                              size_t* bwdDataLimitBytes,
-                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes,
-                              bool useDilation);
-
-/**
- * @brief   destroy filter descriptor.
- *
- * @param[in]   filter  hppl filter descriptor.
- *
- */
-extern void hl_destroy_filter_descriptor(hl_filter_descriptor filter);
-
-/**
- * @brief   create convolution descriptor.
- *
- * @param[out]  conv                    conv descriptor.
- * @param[in]   image                   input image descriptor.
- * @param[in]   filter                  filter descriptor.
- * @param[in]   padding_height          padding height.
- * @param[in]   padding_width           padding width.
- * @param[in]   stride_height           stride height.
- * @param[in]   stride_width            stride width.
- *
- */
-extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                             hl_tensor_descriptor image,
-                                             hl_filter_descriptor filter,
-                                             int padding_height,
-                                             int padding_width,
-                                             int stride_height,
-                                             int stride_width,
-                                             int dilation_h = 1,
-                                             int dilation_w = 1);
-
-/**
- * @brief   reset convolution descriptor.
- *
- * @param[in,out]   conv                conv descriptor.
- * @param[in]       image               input image descriptor.
- * @param[in]       filter              filter descriptor.
- * @param[in]       padding_height      padding height.
- * @param[in]       padding_width       padding width.
- * @param[in]       stride_height       stride height.
- * @param[in]       stride_width        stride width.
- *
- */
-extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                            hl_tensor_descriptor image,
-                                            hl_filter_descriptor filter,
-                                            int padding_height,
-                                            int padding_width,
-                                            int stride_height,
-                                            int stride_width,
-                                            int dilation_h = 1,
-                                            int dilation_w = 1);
-
-/**
- * @brief   destroy convolution descriptor.
- *
- * @param[in]   conv  hppl convolution descriptor.
- */
-extern void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv);
-
-/**
- * @brief   convolution forward(calculate output image).
- *
- * @param[in]   input           input image descriptor.
- * @param[in]   input_data      input image data.
- * @param[in]   output          output image descriptor.
- * @param[out]  output_data     output image data.
- * @param[in]   filter          filter descriptor.
- * @param[in]   filter_data     filter data.
- * @param[in]   conv            convolution descriptor.
- * @param[in]   gpuWorkSpace    limited gpu workspace.
- * @param[in]   sizeInBytes     gpu workspace size (bytes).
- * @param[in]   convFwdAlgo     forward algorithm.
- */
-extern void hl_convolution_forward(hl_tensor_descriptor input,
-                                   real* input_data,
-                                   hl_tensor_descriptor output,
-                                   real* output_data,
-                                   hl_filter_descriptor filter,
-                                   real* filter_data,
-                                   hl_convolution_descriptor conv,
-                                   void* gpuWorkSpace,
-                                   size_t sizeInBytes,
-                                   int convFwdAlgo);
-
-/**
- * @brief   convolution forward add bias(calculate output add bias).
- *
- * @param[in]   bias                bias descriptor.
- * @param[in]   bias_data           bias data.
- * @param[in]   output              output image descriptor.
- * @param[out]  output_data         output image data.
- */
-extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                            real* bias_data,
-                                            hl_tensor_descriptor output,
-                                            real* output_data);
-
-/**
- * @brief   convolution backward filter(calculate filter grad data).
- *
- * @param[in]   input               input image descriptor.
- * @param[in]   input_data          input image data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- * @param[in]   filter              filter descriptor.
- * @param[out]  filter_grad_data    filter grad data.
- * @param[in]   conv                convolution descriptor.
- * @param[in]   gpuWorkSpace        limited gpu workspace.
- * @param[in]   sizeInBytes         gpu workspace size (bytes).
- * @param[in]   convBwdFilterAlgo   backward filter algorithm.
- */
-extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                           real* input_data,
-                                           hl_tensor_descriptor output,
-                                           real* output_grad_data,
-                                           hl_filter_descriptor filter,
-                                           real* filter_grad_data,
-                                           hl_convolution_descriptor conv,
-                                           void* gpuWorkSpace,
-                                           size_t sizeInBytes,
-                                           int convBwdFilterAlgo);
-
-/**
- * @brief   convolution backward data(calculate input image grad data).
- *
- * @param[in]   input               input image descriptor.
- * @param[out]  input_data_grad     input image grad data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- * @param[in]   filter              filter descriptor.
- * @param[in]   filter_data         filter data.
- * @param[in]   conv                convolution descriptor.
- * @param[in]   gpuWorkSpace        limited gpu workspace.
- * @param[in]   sizeInBytes         gpu workspace size (bytes).
- * @param[in]   convBwdDataAlgo     backward data algorithm.
- */
-extern void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                         real* input_data_grad,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data,
-                                         hl_filter_descriptor filter,
-                                         real* filter_data,
-                                         hl_convolution_descriptor conv,
-                                         void* gpuWorkSpace,
-                                         size_t sizeInBytes,
-                                         int convBwdDataAlgo);
-
-/**
- * @brief   convolution backward bias(calculate bias grad data).
- *
- * @param[in]   bias                bias descriptor.
- * @param[out]  bias_grad_data      bias grad data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- */
-extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                         real* bias_grad_data,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data);
-
-/**
- * @brief   softmax forward.
- *
- * @param[in]   input               input value.
- * @param[out]  output              output value.
- * @param[in]   height              matrix height.
- * @param[in]   width               matrix width.
- */
-extern void hl_softmax_forward(real* input,
-                               real* output,
-                               int height,
-                               int width);
-
-/**
- * @brief   softmax backward.
- *
- * @param[in]   output_value        output value data.
- * @param[out]  output_grad         output grad data.
- * @param[in]   height              matrix height.
- * @param[in]   width               matrix width.
- */
-extern void hl_softmax_backward(real* output_value,
-                                real* output_grad,
-                                int height,
-                                int width);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc     input tensor descriptor desc.
- * @param[in]   input         input data.
- * @param[in]   outputDesc    output tensor descriptor desc.
- * @param[out]  output        output data.
- * @param[in]   bnParamDesc   tensor descriptor desc.
- *                            bnScale, bnBias, running mean/var, save_mean/var.
- * @param[in]   scale         batch normalization scale parameter (in original
- *                            paper scale is referred to as gamma).
- * @param[in]   bias          batch normalization bias parameter (in original
- *                            paper scale is referred to as beta).
- * @param[in]   factor        Factor used in the moving average computation.
- *                            runningMean = newMean * factor
- *                                         + runningMean * (1 - factor)
- * @param[in]   runningMean   running mean.
- * @param[in]   runningInvVar running variance.
- * @param[in]   epsilon       Epsilon value used in the batch normalization
- *                            formula.
- * @param[out]  savedMean     optional cache to save intermediate results.
- * @param[out]  savedVar      optional cache to save intermediate results.
- *
- */
-extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real* input,
-                                           hl_tensor_descriptor outputDesc,
-                                           real* output,
-                                           hl_tensor_descriptor bnParamDesc,
-                                           real* scale,
-                                           real* bias,
-                                           double factor,
-                                           real* runningMean,
-                                           real* runningInvVar,
-                                           double epsilon,
-                                           real* savedMean,
-                                           real* savedVar);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc    input tensor descriptor desc.
- * @param[in]   input        input data.
- * @param[in]   outputDesc   output tensor descriptor desc.
- * @param[out]  output       output data.
- * @param[in]   bnParamDesc  tensor descriptor desc.
- *                           bnScale, bnBias, running mean/var, save_mean/var.
- * @param[in]   scale        batch normalization scale parameter (in original
- *                           paper scale is referred to as gamma).
- * @param[in]   bias         batch normalization bias parameter (in original
- *                           paper scale is referred to as beta).
- * @param[in]   estimatedMean
- * @param[in]   estimatedVar It is suggested that resultRunningMean,
- *                           resultRunningVariance from the
- *                           cudnnBatchNormalizationForwardTraining call
- *                           accumulated during the training phase are passed
- *                           as inputs here.
- * @param[in]   epsilon      Epsilon value used in the batch
- *                           normalization formula.
- *
- */
-extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real* input,
-                                            hl_tensor_descriptor outputDesc,
-                                            real* output,
-                                            hl_tensor_descriptor bnParamDesc,
-                                            real* scale,
-                                            real* bias,
-                                            real* estimatedMean,
-                                            real* estimatedVar,
-                                            double epsilon);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc       input tensor descriptor desc.
- * @param[in]   input           input data.
- * @param[in]   outGradDesc     output tensor descriptor desc.
- * @param[out]  outGrad         output data.
- * @param[in]   inGradDesc      input tensor descriptor desc.
- * @param[in]   inGrad          input data.
- * @param[in]   dBnParamDesc    tensor descriptor desc.
- *                              bnScale, bnBias, running mean/var,
- * save_mean/var.
- * @param[in]   scale           batch normalization scale parameter (in original
- *                              paper scale is referred to as gamma).
- * @param[in]   scaleGrad       batch normalization scale parameter (in original
- *                              paper scale is referred to as gamma) gradient.
- * @param[in]   biasGrad        batch normalization bias parameter (in original
- *                              paper scale is referred to as beta) gradient.
- * @param[in]   epsilon         Epsilon value used in the batch
- *                              normalization formula.
- * @param[out]  savedMean       optional cache to save intermediate results.
- * @param[out]  savedInvVar     optional cache to save intermediate results.
- *
- */
-extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real* input,
-                                   hl_tensor_descriptor outGradDesc,
-                                   real* outGrad,
-                                   hl_tensor_descriptor inGradDesc,
-                                   real* inGrad,
-                                   hl_tensor_descriptor dBnParamDesc,
-                                   real* scale,
-                                   real* scaleGrad,
-                                   real* biasGrad,
-                                   double epsilon,
-                                   real* savedMean,
-                                   real* savedInvVar);
-
-#endif  // HL_CUDA_CUDNN_H_
diff --git a/paddle/legacy/cuda/include/hl_cuda_cudnn.ph b/paddle/legacy/cuda/include/hl_cuda_cudnn.ph
deleted file mode 100644
index bb3b89f6faa..00000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cudnn.ph
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_PH_
-#define HL_CUDA_CUDNN_PH_
-
-#include "hl_base.h"
-
-/*
- * @brief   hppl for cudnn tensor4d descriptor.
- */
-typedef struct {
-    cudnnTensorDescriptor_t     desc;
-    cudnnTensorFormat_t         format;
-    cudnnDataType_t             data_type;  // image data type
-    int batch_size;                         // number of input batch size
-    int feature_maps;                       // number of input feature maps
-    int height;                             // height of input image
-    int width;                              // width of input image
-} _cudnn_tensor_descriptor, *cudnn_tensor_descriptor;
-
-#define GET_TENSOR_DESCRIPTOR(image) (((cudnn_tensor_descriptor)image)->desc)
-
-/*
- * @brief   hppl for cudnn pooling descriptor.
- */
-typedef struct {
-    cudnnPoolingDescriptor_t   desc;
-    cudnnPoolingMode_t         mode;
-    int window_height;
-    int window_width;
-    int stride_height;
-    int stride_width;
-} _cudnn_pooling_descriptor, *cudnn_pooling_descriptor;
-
-/*
- * @brief   hppl for cudnn filter descriptor.
- */
-typedef struct {
-    cudnnFilterDescriptor_t   desc;
-    cudnnDataType_t           data_type;    /* data type */
-    int output_feature_maps;        /* number of output feature maps */
-    int input_feature_maps;         /* number of input feature maps */
-    int filter_height;              /* height of each input filter */
-    int filter_width;               /* width of  each input fitler */
-} _cudnn_filter_descriptor, *cudnn_filter_descriptor;
-
-#define GET_FILTER_DESCRIPTOR(filter) (((cudnn_filter_descriptor)filter)->desc)
-
-/*
- * @brief   hppl for cudnn convolution descriptor.
- */
-typedef struct {
-    cudnnConvolutionDescriptor_t    desc;
-    hl_tensor_descriptor             input_image;
-    hl_filter_descriptor            filter;
-    int padding_height;                     // zero-padding height
-    int padding_width;                      // zero-padding width
-    int stride_height;                      // vertical filter stride
-    int stride_width;                       // horizontal filter stride
-    int upscalex;                           // upscale the input in x-direction
-    int upscaley;                           // upscale the input in y-direction
-    cudnnConvolutionMode_t          mode;
-} _cudnn_convolution_descriptor, *cudnn_convolution_descriptor;
-
-#define GET_CONVOLUTION_DESCRIPTOR(conv)    \
-    (((cudnn_convolution_descriptor)conv)->desc)
-
-#endif /* HL_CUDA_CUDNN_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_device_functions.cuh b/paddle/legacy/cuda/include/hl_device_functions.cuh
deleted file mode 100755
index ef068e10622..00000000000
--- a/paddle/legacy/cuda/include/hl_device_functions.cuh
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_DEVICE_FUNCTIONS_CUH_
-#define HL_DEVICE_FUNCTIONS_CUH_
-
-namespace paddle {
-
-template <class T>
-inline __device__ T paddleAtomicAdd(T* address, T val);
-
-template <>
-inline __device__ float paddleAtomicAdd(float* address, float val) {
-  return atomicAdd(address, val);
-}
-
-template <>
-inline __device__ double paddleAtomicAdd(double* address, double val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
-  return atomicAdd(address, val);
-#else
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed; // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-#endif
-}
-}  // namespace paddle
-
-/**
- * @brief  sum reduction
- *
- * @param[in,out]  smem       input data, better to use __shared__ memory.
- * @param[in]      tid        thread index.
- * @param[in]      threads    the total thread number used to reduce,
- *                            such as, blockDim.x.
- *
- * @return smem[0]: the sum of each elements in smem.
- */
-__device__ __forceinline__
-void simpleReduce(real* smem, int tid, int threads) {
-  for (unsigned int s = threads / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      smem[tid] += smem[tid + s];
-    }
-    __syncthreads();
-  }
-}
-
-#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_functions.h b/paddle/legacy/cuda/include/hl_functions.h
deleted file mode 100644
index 9912b4c1799..00000000000
--- a/paddle/legacy/cuda/include/hl_functions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_FUNCTIONS_H_
-#define HL_FUNCTIONS_H_
-
-#include "hl_base.h"
-
-/**
- * sigmoid threshold maximum
- */
-#define SIGMOID_THRESHOLD_MIN -40.0
-
-/**
- * sigmoid threshold minimum
- */
-#define SIGMOID_THRESHOLD_MAX 13.0
-
-#ifndef __NVCC__
-namespace hppl {
-/*
- * forward activation
- */
-real relu(const real a);
-real sigmoid(const real a);
-real tanh(const real a);
-real linear(const real a);
-
-/*
- * backward activation
- */
-real relu(const real a, const real b);
-real sigmoid(const real a, const real b);
-real tanh(const real a, const real b);
-real linear(const real a, const real b);
-}  // namespace hppl
-
-#ifdef __AVX__
-#include "hl_avx_functions.h"
-#endif
-
-#else
-#include "hl_gpu_functions.cuh"
-#endif
-
-#endif  // HL_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_gpu.h b/paddle/legacy/cuda/include/hl_gpu.h
deleted file mode 100644
index 50a2e9cdd29..00000000000
--- a/paddle/legacy/cuda/include/hl_gpu.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_GPU_H_
-#define HL_GPU_H_
-
-#include "hl_aggregate.h"
-#include "hl_base.h"
-#include "hl_cnn.h"
-#include "hl_cuda.h"
-#include "hl_cuda_cublas.h"
-#include "hl_cuda_cudnn.h"
-#include "hl_lstm.h"
-#include "hl_matrix.h"
-#include "hl_sequence.h"
-#include "hl_sparse.h"
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "hl_warpctc_wrap.h"
-#endif
-
-#ifdef HPPL_STUB_FUNC
-#include "stub/hl_aggregate_stub.h"
-#include "stub/hl_cnn_stub.h"
-#include "stub/hl_cuda_cublas_stub.h"
-#include "stub/hl_cuda_cudnn_stub.h"
-#include "stub/hl_cuda_stub.h"
-#include "stub/hl_lstm_stub.h"
-#include "stub/hl_matrix_stub.h"
-#include "stub/hl_sequence_stub.h"
-#include "stub/hl_sparse_stub.h"
-#endif
-
-#endif /* HL_GPU_H_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_functions.cuh b/paddle/legacy/cuda/include/hl_gpu_functions.cuh
deleted file mode 100644
index 705aa71f4ba..00000000000
--- a/paddle/legacy/cuda/include/hl_gpu_functions.cuh
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_FUNCTIONS_CUH_
-#define HL_GPU_FUNCTIONS_CUH_
-
-#include "hl_base.h"
-
-namespace hppl {
-
-  __device__ static real relu(const real a) {
-    return a > 0.0f ? a : 0.0f;
-  }
-
-  __device__ static real sigmoid(const real a) {
-    const real min = SIGMOID_THRESHOLD_MIN;
-    const real max = SIGMOID_THRESHOLD_MAX;
-    real tmp = (a < min) ? min : ((a > max) ? max : a);
-#ifndef PADDLE_TYPE_DOUBLE
-    return __fdividef(1.0f, 1.0f + __expf(-tmp));
-#else
-    return 1.0 / (1.0 + exp(-tmp));
-#endif
-  }
-
-  __device__ static real tanh(const real a) {
-#ifndef PADDLE_TYPE_DOUBLE
-    return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
-#else
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
-#endif
-  }
-
-  __device__ static real linear(const real a) {
-    return a;
-  }
-
-  __device__ static real relu(const real a, const real b) {
-    return a * (b > 0.0f ? 1.0f : 0.0f);
-  }
-
-  __device__ static real sigmoid(const real a, const real b) {
-    return a * b * (1 - b);
-  }
-
-  __device__ static real tanh(const real a, const real b) {
-    return a * (1.0f - b * b);
-  }
-
-  __device__ static real linear(const real a, const real b) {
-    return a;
-  }
-
-}  // namespace hppl
-
-#endif  // HL_GPU_FUNCTIONS_CUH_
diff --git a/paddle/legacy/cuda/include/hl_gpu_gru.cuh b/paddle/legacy/cuda/include/hl_gpu_gru.cuh
deleted file mode 100644
index 8d299572c73..00000000000
--- a/paddle/legacy/cuda/include/hl_gpu_gru.cuh
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_GRU_CUH_
-#define HL_GPU_GRU_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/legacy/utils/Logging.h"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetOutput, bool isBatch>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
-                                        real *gateValue,
-                                        real *resetOutputValue,
-                                        real *prevOutputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    resetOutputValue += batchIdx * frameSize;
-  }
-
-  real rPrevOut = 0;
-  real rValueResetOutput;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueResetGate  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opResetOutput(rValueUpdateGate,
-                rValueResetGate,
-                rPrevOut,
-                rValueResetOutput,
-                hppl::gpu::forward[active_gate]);
-
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpFinalOutput, bool isBatch>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
-                                        real *gateValue,
-                                        real *prevOutputValue,
-                                        real *outputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    outputValue += batchIdx * frameSize;
-  }
-
-  real rOutput;
-  real rPrevOut = 0;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueFrameState = gateValue[frameIdx + frameSize * 2];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opFinalOutput(rValueUpdateGate,
-                rValueFrameState,
-                rPrevOut,
-                rOutput,
-                hppl::gpu::forward[active_node]);
-
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
-  outputValue[frameIdx] = rOutput;
-}
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.prevOutValue, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_N,
-                  value.gateValue,
-                  batchSize, 2*frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, 2* frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  } else {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.resetOutputValue, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_N,
-                  value.gateValue + 2*frameSize,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  } else {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  }
-
-  CHECK_SYNC("hl_gpu_gru_forward failed");
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpStateGrad, bool isBatch>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *outputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    outputGrad += batchIdx * frameSize;
-  }
-
-  real rUpdateGateGrad;
-  real rFrameStateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rFrameStateValue = gateValue[frameIdx + frameSize * 2];
-  real rOutGrad  = outputGrad[frameIdx];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-  }
-
-  opStateGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rFrameStateValue,
-              rFrameStateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rOutGrad,
-              hppl::gpu::backward[active_node]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetGrad, bool isBatch>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *resetOutputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    resetOutputGrad += batchIdx * frameSize;
-  }
-
-  real rResetGateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rResetOutputGrad = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rUpdateGateGrad  = gateGrad[frameIdx + frameSize * 0];
-  real rResetGateValue  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
-  }
-
-  opResetGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rResetGateValue,
-              rResetGateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rResetOutputGrad,
-              hppl::gpu::backward[active_gate]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  } else {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  }
-
-  if (value.prevOutValue && grad.prevOutGrad) {
-    hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_T,
-                  grad.resetOutputGrad,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 0,
-                  3*frameSize, frameSize, frameSize);
-    if (grad.stateWeightGrad) {
-      hl_matrix_mul(value.resetOutputValue, HPPL_OP_T,
-                    grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                    grad.stateWeightGrad,
-                    frameSize, frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, frameSize);
-    }
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  } else {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  }
-
-  if (grad.prevOutGrad && value.prevOutValue) {
-    hl_matrix_mul(grad.gateGrad, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_T,
-                  grad.prevOutGrad,
-                  batchSize, frameSize, 2*frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  3*frameSize, 2*frameSize, frameSize);
-    if (grad.gateWeightGrad) {
-      hl_matrix_mul(value.prevOutValue, HPPL_OP_T,
-                    grad.gateGrad, HPPL_OP_N,
-                    grad.gateWeightGrad,
-                    frameSize, 2*frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, 2*frameSize);
-    }
-  }
-
-  CHECK_SYNC("hl_gpu_gru_backward failed");
-}
-
-#else
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {}
-
-#endif
-
-#endif /* HL_GPU_GRU_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
deleted file mode 100644
index aae011b838c..00000000000
--- a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_LSTM_CUH_
-#define HL_GPU_LSTM_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/legacy/utils/Logging.h"
-#include "hl_device_functions.cuh"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmForward(Op op,
-                              hl_lstm_value value,
-                              int frameSize,
-                              int batchSize,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.outputValue += batchIdx * frameSize;
-    value.stateValue  += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-  }
-
-  real rState;
-  real rPrevState = 0;
-  real rStateAtv;
-  real rOut;
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rPrevState,
-     rState,
-     rStateAtv,
-     rOut,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     hppl::gpu::forward[active_node],
-     hppl::gpu::forward[active_gate],
-     hppl::gpu::forward[active_state]);
-
-  value.gateValue[frameIdx] = rValueIn;
-  value.gateValue[frameIdx + frameSize] = rValueIg;
-  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
-  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
-
-  value.stateValue[frameIdx] = rState;
-  value.stateActiveValue[frameIdx] = rStateAtv;
-  value.outputValue[frameIdx] = rOut;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmBackward(Op op,
-                               hl_lstm_value value,
-                               hl_lstm_grad grad,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.stateValue += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-    grad.gateGrad += batchIdx * frameSize * 4;
-    grad.stateGrad += batchIdx * frameSize;
-    grad.outputGrad += batchIdx * frameSize;
-  }
-
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rGradIn;
-  real rGradIg;
-  real rGradFg;
-  real rGradOg;
-  real rPrevState = 0;
-  real rPrevStateGrad;
-  real rState;
-  real rStateGrad;
-  real rStateAtv;
-  real rOutputGrad;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-  real rCheckIGrad;
-  real rCheckFGrad;
-  real rCheckOGrad;
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-  rState = value.stateValue[frameIdx];
-  rStateAtv = value.stateActiveValue[frameIdx];
-  rOutputGrad = grad.outputGrad[frameIdx];
-  rStateGrad = grad.stateGrad[frameIdx];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rGradIn,
-     rGradIg,
-     rGradFg,
-     rGradOg,
-     rPrevState,
-     rPrevStateGrad,
-     rState,
-     rStateGrad,
-     rStateAtv,
-     rOutputGrad,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     rCheckIGrad,
-     rCheckFGrad,
-     rCheckOGrad,
-     hppl::gpu::backward[active_node],
-     hppl::gpu::backward[active_gate],
-     hppl::gpu::backward[active_state]);
-
-  grad.gateGrad[frameIdx] = rGradIn;
-  grad.gateGrad[frameIdx + frameSize    ] = rGradIg;
-  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
-  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
-  grad.stateGrad[frameIdx] = rStateGrad;
-  if (grad.prevStateGrad) {
-    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
-    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
-  }
-
-  if (isBatch) {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
-    }
-    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
-  } else {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
-  }
-}
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmForward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmForward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_forward failed");
-}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmBackward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmBackward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_backward failed");
-}
-
-#else
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {}
-
-#endif
-
-#endif /* HL_GPU_LSTM_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
deleted file mode 100644
index 6177d23657f..00000000000
--- a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
+++ /dev/null
@@ -1,629 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-
-#ifndef HL_GPU_MATRIX_KERNEL_CUH_
-#define HL_GPU_MATRIX_KERNEL_CUH_
-
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "hl_base.h"
-
-#ifdef __NVCC__
-/* gpu apply interface */
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d,
-                                 int dimM,
-                                 int dimN,
-                                 int lda,
-                                 Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i * lda + j]);
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx]);
-  }
-}
-
-template<class T, class Op, bool BAsRowVector, bool BAsColVector>
-__global__ void KeEltWiseBinaryOp(T *A_d,
-                                  T *B_d,
-                                  int dimM,
-                                  int dimN,
-                                  int lda,
-                                  int ldb,
-                                  Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (BAsRowVector == 0 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]);
-      } else if (BAsRowVector == 1 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[j]);
-      } else if (BAsRowVector == 0 && BAsColVector == 1) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]);
-      } else {
-        op.gpuOperator(A_d[i * lda + j], B_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T *B_d,
-                                   T *C_d,
-                                   const int border,
-                                   Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]);
-  }
-}
-
-template<class T, class Op, bool CAsRowVector, bool CAsColVector>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T* B_d,
-                                   T* C_d,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb,
-                                   int ldc,
-                                   Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (CAsRowVector == 0 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]);
-      } else if (CAsRowVector == 1 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]);
-      } else if (CAsRowVector == 0 && CAsColVector == 1) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]);
-      } else {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      const int border,
-                                      Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      int dimM,
-                                      int dimN,
-                                      int lda,
-                                      int ldb,
-                                      int ldc,
-                                      int ldd,
-                                      Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i*lda + j],
-        B_d[i*ldb + j], C_d[i*ldc + j], D_d[i*ldd + j]);
-    }
-  }
-}
-
-/**
- * @brief   gpu element wise unary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {
-  CHECK_NOTNULL(A_d);
-
-  if (dimM == 1 || dimN == lda) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseUnaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseUnaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, dimM, dimN, lda, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_unary_op failed");
-}
-
-/**
- * @brief   gpu element wise binary operator.
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {
-  CHECK_NOTNULL(A_d);
-
-  if ((BAsRowVector == 0 && BAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseBinaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseBinaryOp<T, Op, BAsRowVector, BAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, dimM, dimN, lda, ldb, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_binary_op failed");
-}
-
-/**
- * @brief   gpu element wise ternary operator.
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {
-  CHECK_NOTNULL(A_d);
-
-  if ((CAsRowVector == 0 && CAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseTernaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseTernaryOp<T, Op, CAsRowVector, CAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_ternary_op failed");
-}
-
-
-/**
- * @brief   gpu element wise quaternary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {
-  CHECK_NOTNULL(A_d);
-
-  if ((dimM == 1) ||
-      (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseQuaternaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseQuaternaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_quaternary_op failed");
-}
-
-#else
-
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {}
-
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {}
-
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {}
-
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {}
-#endif
-
-#ifdef __NVCC__
-/**
- * @brief   matrix row operator.
- */
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-      tmp = agg(tmp, op(A[idx]));
-      idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A, real *B) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-    tmp = agg(tmp, op(A[idx], B[idx]));
-    idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg>
-__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) {
-  for (int stride = size/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      row[tid] = agg(row[tid], row[tid + stride]);
-    }
-    __syncthreads();
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda,
-                              real *B, int ldb) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  B += rowId*ldb;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride,
-                                  int dimM, real *A, int lda) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride, int dimM,
-                                  real *A, int lda, real *B, int ldb) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda], B[index*ldb]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda,
-                                 real *B, int ldb) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda,
-                                   real *B, int ldb) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op,
-        threadIdx.y, blockDimY, dimM, A, lda, B, ldb);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-#endif
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (agg, op, sv, dimN, dst, ld, A, lda);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (agg, op, sv, dimN, dst, ld, A, lda, B, ldb);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gru_ops.cuh b/paddle/legacy/cuda/include/hl_gru_ops.cuh
deleted file mode 100644
index 6c647c514db..00000000000
--- a/paddle/legacy/cuda/include/hl_gru_ops.cuh
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GRU_OPS_CUH_
-#define HL_GRU_OPS_CUH_
-
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
-#endif
-
-namespace hppl {
-
-namespace forward {
-class gru_resetOutput {
-public:
-  /**
-   * @param[in,out]   valueUpdateGate  update gate
-   * @param[in,out]   valueResetGate   reset gate
-   * @param[in]       prevOut          previous output
-   * @param[out]      valueResetOutput intermediate value for frame state
-   * @param[in]       actGate          forward function of gate
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &valueResetGate,
-                         real &prevOut,
-                         real &valueResetOutput,
-                         Active<real>::forward actGate) {
-    valueUpdateGate  = actGate(valueUpdateGate);
-    valueResetGate   = actGate(valueResetGate);
-    valueResetOutput = prevOut * valueResetGate;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &valueResetGate,
-                         __m256 &prevOut,
-                         __m256 &valueResetOutput,
-                         Active<__m256>::forward actGate) {
-    valueUpdateGate  = actGate(valueUpdateGate);
-    valueResetGate   = actGate(valueResetGate);
-    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
-  }
-#endif
-#endif
-};
-
-class gru_finalOutput {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate
-   * @param[in,out] valueFrameState   frame state ({\tilde{h}_t})
-   * @param[in]     prevOut           previous output
-   * @param[out]    valueOutput       output
-   * @param[in]     actInput          forward function of node
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &valueFrameState,
-                         real &prevOut,
-                         real &valueOutput,
-                         Active<real>::forward actInput ) {
-    valueFrameState = actInput(valueFrameState);
-    valueOutput = prevOut - (valueUpdateGate * prevOut) +
-      (valueUpdateGate * valueFrameState);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &valueFrameState,
-                         __m256 &prevOut,
-                         __m256 &valueOutput,
-                         Active<__m256>::forward actInput) {
-    valueFrameState = actInput(valueFrameState);
-    valueOutput = _mm256_add_ps(
-      _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
-      _mm256_mul_ps(valueUpdateGate, valueFrameState));
-  }
-#endif
-#endif
-};
-}  // namespace forward
-
-namespace backward {
-class gru_stateGrad {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate value
-   * @param[out]    gradUpdateGate    update gate grad
-   * @param[in]     valueFrameState   frame state value
-   * @param[out]    gradFrameState    frame state grad
-   * @param[in]     valuePrevOut      previous output value
-   * @param[in,out] gradPrevOut       previous output grad
-   * @param[in]     gradOutput        output grad
-   * @param[in]     actInput          backward function of frame state
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &gradUpdateGate,
-                         real &valueFrameState,
-                         real &gradFrameState,
-                         real &valuePrevOut,
-                         real &gradPrevOut,
-                         real &gradOutput,
-                         Active<real>::backward actInput) {
-    gradUpdateGate = (gradOutput * valueFrameState);
-    gradUpdateGate -= (gradOutput * valuePrevOut);
-    gradPrevOut -= (gradOutput * valueUpdateGate);
-    gradPrevOut += gradOutput;
-    gradFrameState = actInput(gradOutput * valueUpdateGate, valueFrameState);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &gradUpdateGate,
-                         __m256 &valueFrameState,
-                         __m256 &gradFrameState,
-                         __m256 &valuePrevOut,
-                         __m256 &gradPrevOut,
-                         __m256 &gradOutput,
-                         Active<__m256>::backward actInput) {
-    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
-    gradUpdateGate = _mm256_sub_ps(
-      gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
-    gradPrevOut = _mm256_add_ps(
-      _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
-      gradOutput);
-    gradFrameState = actInput(
-      _mm256_mul_ps(gradOutput, valueUpdateGate), valueFrameState);
-  }
-#endif
-#endif
-};
-
-class gru_resetGrad {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate value
-   * @param[in,out] gradUpdateGate    update gate grad
-   * @param[in]     valueResetGate    reset gate value
-   * @param[out]    gradResetGate     reset gate grad
-   * @param[in]     valuePrevOut      previous output value
-   * @param[in,out] gradPrevOut       previous output grad
-   * @param[in]     gradResetOutput   reset output grad (temp val)
-   * @param[in]     actGate           backward function of gate
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &gradUpdateGate,
-                         real &valueResetGate,
-                         real &gradResetGate,
-                         real &valuePrevOut,
-                         real &gradPrevOut,
-                         real &gradResetOutput,
-                         Active<real>::backward actGate) {
-    gradResetGate = (gradResetOutput * valuePrevOut);
-    gradPrevOut += (gradResetOutput * valueResetGate);
-    gradUpdateGate = actGate(gradUpdateGate, valueUpdateGate);
-    gradResetGate  = actGate(gradResetGate , valueResetGate);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &gradUpdateGate,
-                         __m256 &valueResetGate,
-                         __m256 &gradResetGate,
-                         __m256 &valuePrevOut,
-                         __m256 &gradPrevOut,
-                         __m256 &gradResetOutput,
-                         Active<__m256>::backward actGate) {
-    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
-    gradPrevOut = _mm256_add_ps(
-      gradPrevOut, _mm256_mul_ps(gradResetOutput, valueResetGate));
-    gradUpdateGate = actGate(gradUpdateGate, valueUpdateGate);
-    gradResetGate  = actGate(gradResetGate , valueResetGate);
-  }
-#endif
-#endif
-};
-}  // namespace backward
-}  // namespace hppl
-
-#endif /* HL_GRU_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_lstm.h b/paddle/legacy/cuda/include/hl_lstm.h
deleted file mode 100644
index 5db4783bf4d..00000000000
--- a/paddle/legacy/cuda/include/hl_lstm.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_LSTM_H_
-#define HL_LSTM_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Lstm sequence parallel forward.
- *
- * @param[in]   gateValue           input value.
- * @param[out]  stateValue          state value.
- * @param[out]  preOutputValue     prev output value.
- * @param[out]  outputValue         output value.
- * @param[in]   checkIg             bias.
- * @param[in]   checkFg             bias.
- * @param[in]   checkOg             bias.
- * @param[in]   weight              weight.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- *
- *
- * @note    Only support frameSize = 32 or 64.
- */
-extern void hl_lstm_parallel_forward(real *gateValue,
-                                     real *stateValue,
-                                     real *preOutputValue,
-                                     real *outputValue,
-                                     real *checkIg,
-                                     real *checkFg,
-                                     real *checkOg,
-                                     real *weight,
-                                     const int *sequence,
-                                     int frameSize,
-                                     int numSequences,
-                                     bool reversed,
-                                     hl_activation_mode_t active_node,
-                                     hl_activation_mode_t active_gate,
-                                     hl_activation_mode_t active_state);
-
-/**
- * @brief   Lstm sequence parallel backward data.
- *
- * @param[in]   gateValue           input value.
- * @param[out]  gateGrad            input gradient.
- * @param[in]   stateValue          state value.
- * @param[out]  stateGrad           state gradient.
- * @param[out]  preOutputValue     prev output value.
- * @param[out]  preOutputGrad      prev output gradient.
- * @param[in]   outputGrad          output gradient.
- * @param[in]   checkIg             bias.
- * @param[out]  checkIgGrad         bias gradient.
- * @param[in]   checkFg             bias.
- * @param[out]  checkFgGrad         bias gradient.
- * @param[in]   checkOg             bias.
- * @param[out]  checkOgGrad         bias gradient.
- * @param[in]   weight              weight.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- *
- *
- * @note    Only support frameSize = 32 or 64.
- */
-extern void hl_lstm_parallel_backward_data(real *gateValue,
-                                           real *gateGrad,
-                                           real *stateValue,
-                                           real *stateGrad,
-                                           real *preOutputValue,
-                                           real *preOutputGrad,
-                                           real *outputGrad,
-                                           real *checkIg,
-                                           real *checkIgGrad,
-                                           real *checkFg,
-                                           real *checkFgGrad,
-                                           real *checkOg,
-                                           real *checkOgGrad,
-                                           real *weight,
-                                           const int *sequence,
-                                           int frameSize,
-                                           int numSequences,
-                                           bool reversed,
-                                           hl_activation_mode_t active_node,
-                                           hl_activation_mode_t active_gate,
-                                           hl_activation_mode_t active_state);
-
-/**
- * @brief   Lstm sequence parallel backward weight.
- *
- * @param[out]  weightGrad          weight gradient.
- * @param[in]   outputValue         output value.
- * @param[in]   gateGrad            gate gradient.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           batch size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- *
- */
-extern void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                             real *outputValue,
-                                             real *gateGrad,
-                                             const int *sequence,
-                                             int frameSize,
-                                             int batchSize,
-                                             int numSequences,
-                                             bool reversed);
-
-#endif /* HL_LSTM_H_ */
diff --git a/paddle/legacy/cuda/include/hl_lstm_ops.cuh b/paddle/legacy/cuda/include/hl_lstm_ops.cuh
deleted file mode 100644
index 394fdf5ac07..00000000000
--- a/paddle/legacy/cuda/include/hl_lstm_ops.cuh
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_LSTM_OPS_CUH_
-#define HL_LSTM_OPS_CUH_
-
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
-#endif
-
-namespace hppl {
-
-namespace forward {
-class lstm {
-public:
-  /**
-   * @param   valueIn     input
-   * @param   valueIg     input gate
-   * @param   valueFg     forget gate
-   * @param   valueOg     output gate
-   * @param   prevState   previous state
-   * @param   state       current state
-   * @param   stateAtv    state active
-   * @param   output      output
-   * @param   checkI      check input gate
-   * @param   checkF      check forget gate
-   * @param   checkO      check output gate
-   * @param   actInput    forward function of input
-   * @param   actGate     forward function of gate
-   * @param   actState    forward function of state
-   */
-  INLINE void operator()(real &valueIn,
-                         real &valueIg,
-                         real &valueFg,
-                         real &valueOg,
-                         real &prevState,
-                         real &state,
-                         real &stateAtv,
-                         real &output,
-                         real &checkI,
-                         real &checkF,
-                         real &checkO,
-                         Active<real>::forward actInput,
-                         Active<real>::forward actGate,
-                         Active<real>::forward actState) {
-    valueIn = actInput(valueIn);
-    valueIg = actGate(valueIg + prevState * checkI);
-    valueFg = actGate(valueFg + prevState * checkF);
-    state = valueIn * valueIg + prevState * valueFg;
-    valueOg = actGate(valueOg + state * checkO);
-    stateAtv = actState(state);
-    output = valueOg * stateAtv;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueIn,
-                         __m256 &valueIg,
-                         __m256 &valueFg,
-                         __m256 &valueOg,
-                         __m256 &prevState,
-                         __m256 &state,
-                         __m256 &stateAtv,
-                         __m256 &output,
-                         __m256 &checkI,
-                         __m256 &checkF,
-                         __m256 &checkO,
-                         Active<__m256>::forward actInput,
-                         Active<__m256>::forward actGate,
-                         Active<__m256>::forward actState) {
-    valueIn = actInput(valueIn);
-    valueIg = actGate(
-      _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
-    valueFg = actGate(
-      _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
-    state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg)
-        , _mm256_mul_ps(prevState, valueFg));
-    valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)));
-    stateAtv = actState(state);
-    output = _mm256_mul_ps(valueOg, stateAtv);
-  }
-#endif
-#endif
-};
-}  // namespace forward
-
-namespace backward {
-class lstm {
-public:
-  /**
-   * @param   valueIn         input
-   * @param   valueIg         input gate
-   * @param   valueFg         forget gate
-   * @param   valueOg         output gate
-   * @param   gradIn          input grad
-   * @param   gradIg          input gate grad
-   * @param   gradFg          forget gate grad
-   * @param   gradOg          output gate grad
-   * @param   prevState       previous state value
-   * @param   prevStateGrad   previous state grad
-   * @param   state           current state value
-   * @param   stateGrad       current state grad
-   * @param   stateAtv        state active
-   * @param   outputGrad      output grad
-   * @param   checkI          check input gate
-   * @param   checkF          check forget gate
-   * @param   checkO          check output gate
-   * @param   checkIGrad      check input gate grad
-   * @param   checkFGrad      check forget gate grad
-   * @param   checkOGrad      check output gate grad
-   * @param   actInput        backward function of input
-   * @param   actGate         backward function of gate
-   * @param   actState        backward function of state
-   */
-  INLINE void operator()(real &valueIn,
-                         real &valueIg,
-                         real &valueFg,
-                         real &valueOg,
-                         real &gradIn,
-                         real &gradIg,
-                         real &gradFg,
-                         real &gradOg,
-                         real &prevState,
-                         real &prevStateGrad,
-                         real &state,
-                         real &stateGrad,
-                         real &stateAtv,
-                         real &outputGrad,
-                         real &checkI,
-                         real &checkF,
-                         real &checkO,
-                         real &checkIGrad,
-                         real &checkFGrad,
-                         real &checkOGrad,
-                         Active<real>::backward actInput,
-                         Active<real>::backward actGate,
-                         Active<real>::backward actState) {
-    gradOg = actGate(outputGrad * stateAtv, valueOg);
-    stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
-    gradIn = actInput(stateGrad * valueIg, valueIn);
-    gradIg = actGate(stateGrad * valueIn, valueIg);
-    gradFg = actGate(stateGrad * prevState, valueFg);
-    prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
-    checkIGrad = gradIg * prevState;
-    checkFGrad = gradFg * prevState;
-    checkOGrad = gradOg * state;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueIn,
-                         __m256 &valueIg,
-                         __m256 &valueFg,
-                         __m256 &valueOg,
-                         __m256 &gradIn,
-                         __m256 &gradIg,
-                         __m256 &gradFg,
-                         __m256 &gradOg,
-                         __m256 &prevState,
-                         __m256 &prevStateGrad,
-                         __m256 &state,
-                         __m256 &stateGrad,
-                         __m256 &stateAtv,
-                         __m256 &outputGrad,
-                         __m256 &checkI,
-                         __m256 &checkF,
-                         __m256 &checkO,
-                         __m256 &checkIGrad,
-                         __m256 &checkFGrad,
-                         __m256 &checkOGrad,
-                         Active<__m256>::backward actInput,
-                         Active<__m256>::backward actGate,
-                         Active<__m256>::backward actState) {
-    gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
-    stateGrad = _mm256_add_ps(
-      actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
-    stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
-    gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn);
-    gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg);
-    gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg);
-    prevStateGrad = _mm256_add_ps(
-      _mm256_mul_ps(gradIg, checkI), _mm256_mul_ps(gradFg, checkF));
-    prevStateGrad = _mm256_add_ps(
-      _mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
-    checkIGrad = _mm256_mul_ps(gradIg, prevState);
-    checkFGrad = _mm256_mul_ps(gradFg, prevState);
-    checkOGrad = _mm256_mul_ps(gradOg, state);
-  }
-#endif
-#endif
-};
-}  // namespace backward
-}  // namespace hppl
-
-#endif /* HL_LSTM_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix.h b/paddle/legacy/cuda/include/hl_matrix.h
deleted file mode 100644
index 88d538343f9..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_H_
-#define HL_MATRIX_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Matrix addition: C_d[i] = alpha * A_d[i] + beta * B_d[i].
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- * @param[in]   alpha   scalar used for addition.
- * @param[in]   beta    scalar used for addition.
- *
- */
-extern void hl_matrix_add(
-    real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
-/**
- * @brief   Matrix Softmax.
- *
- * @param[in]   A_d     input maxtrix (M x N).
- * @param[out]  C_d     output matrix (M x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
-
-/**
- * @brief   Matrix softmax derivative.
- *
- * @param[out]  grad_d       intput matrix (M x N).
- * @param[in]   output_d     output matrix (M x N).
- * @param[in]   sftmaxSum_d  softmax sum (M * 1).
- * @param[in]   dimM         matrix height.
- * @param[in]   dimN         matrix width.
- *
- */
-extern void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
-
-/**
- * @brief   Sequence softmax.
- *
- * @param[in]   A_d         input vector.
- * @param[out]  C_d         output vector.
- * @param[in]   index       start positions of sequence.
- * @param[in]   numSequence sequence number.
- *
- */
-extern void hl_sequence_softmax_forward(real* A_d,
-                                        real* C_d,
-                                        const int* index,
-                                        int numSequence);
-
-/**
- * @brief   Matrix cross entropy.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M X 1).
- * @param[in]   label_d input matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN);
-
-/**
- * @brief   Matrix cross entropy back propagation.
- *
- * @param[out]  grad_d      output matrix (M x N).
- * @param[in]   output_d    input matrix (M x N).
- * @param[in]   label_d     input vector (M x 1).
- * @param[in]   dimM        matrix height.
- * @param[in]   dimN        matrix width.
- *
- */
-extern void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
-
-/**
- * @brief  Matrix multi-binary label cross entropy
- *
- * @param[in]   output    input matrix (M x N).
- * @param[out]  entropy   output matrix (M x 1).
- * @param[in]   mat       input sparse matrix.
- * @param[in]   dimM      matrix height.
- * @param[in]   dimN      matrix width.
- */
-extern void hl_matrix_multi_binary_cross_entropy(
-    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
-
-/**
- * @brief  Matrix multi-binary label cross entropy backprop
- *
- * @param[in]   output    input matrix (M x N).
- * @param[out]  grad      output matrix (M x N).
- * @param[in]   mat       input sparse matrix.
- * @param[in]   dimM      matrix height.
- * @param[in]   dimN      matrix width.
- */
-extern void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
-
-/**
- * @brief  Matrix zero memory.
- *
- * @param[in,out]  data   input data.
- * @param[in]      num    length of data.
- *
- */
-extern void hl_matrix_zero_mem(real* data, int num);
-
-/**
- * @brief parameter relu forward
- *
- * @param[out] output     output data
- * @param[in]  input      input data
- * @param[in]  w          parameter data
- * @param[in]  width      matrix width
- * @param[in]  height     matrix height
- * @param[in]  partial_sum
- */
-
-extern void hl_param_relu_forward(
-    real* output, real* input, real* w, int width, int height, int partial_sum);
-/**
- * @brief parameter relu backward w
- *
- * @param[out] grad_w      w grad
- * @param[in]  grad_o      output grad
- * @param[in]  input       input data
- * @param[in]  width       matrix width
- * @param[in]  height      matrix height
- * @param[in]  partial_sum
- */
-extern void hl_param_relu_backward_w(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum);
-/**
- * @brief parameter relu backward diff
- *
- * @param[in]       grad_o      output grad
- * @param[in]       input       input data
- * @param[in]       w           parameter
- * @param[out]      diff        diff
- * @param[in]       width       matrix width
- * @param[in]       height      matrix height
- * @param[in]       partial_sum
- */
-extern void hl_param_relu_backward_diff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum);
-
-/**
- * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input matrix (1 x channel).
- * @param[in]   channel width of B.
- * @param[in]   dimM    height of A.
- * @param[in]   dimN    width of A.
- * @param[in]   scale   scalar used for addition.
- *
- */
-extern void hl_matrix_add_shared_bias(real* A_d,
-                                      real* B_d,
-                                      const int channel,
-                                      const int dimM,
-                                      const int dimN,
-                                      real scale);
-
-/**
- * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
- *
- * @param[in]   B_d     input matrix (1 x channel).
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   channel width of B.
- * @param[in]   dimM    height of A.
- * @param[in]   dimN    width of A.
- * @param[in]   scale   scalar used for addition.
- *
- */
-extern void hl_matrix_collect_shared_bias(real* B_d,
-                                          real* A_d,
-                                          const int channel,
-                                          const int dimM,
-                                          const int dimN,
-                                          real scale);
-
-/**
- * @brief  Matrix rotation in 90 degrees
- *
- * @param[in]   mat       input matrix (M x N).
- * @param[out]  matRot    output matrix (N x M).
- * @param[in]   dimM      input matrix height.
- * @param[in]   dimN      input matrix width.
- * @param[in]   clockWise rotation direction
- */
-extern void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
-
-/**
- * @brief  Matrix vol2Col: Convert 3D volume into col matrix
- *
- * @param[in]   matSrc     input matrix.
- * @param[in]   channel    channel of matSrc.
- * @param[in]   depth      depth of matSrc.
- * @param[in]   height     height of matSrc.
- * @param[in]   width      width of matSrc.
- * @param[in]   filterD    depth of filter.
- * @param[in]   filterH    height of filter.
- * @param[in]   filterW    width of filter.
- * @param[in]   strideD    stride in the depth.
- * @param[in]   strideH    stride in the height.
- * @param[in]   strideW    stride in the width.
- * @param[in]   paddingD   padding in the depth.
- * @param[in]   paddingH   padding in the height.
- * @param[in]   paddingW   padding in the width.
- * @param[out]   dataDst     output matrix.
- *
- */
-extern void hl_matrix_vol2Col(const real* dataSrc,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              real* dataDst);
-
-/**
- * @brief  Matrix col2Vol: Convert col matrix into 3D volume
- *
- * @param[out]  matDst     output matrix.
- * @param[in]   channel    channel of matDst.
- * @param[in]   depth      depth of matDst.
- * @param[in]   height     height of matDst.
- * @param[in]   width      width of matDst.
- * @param[in]   filterD    depth of filter.
- * @param[in]   filterH    height of filter.
- * @param[in]   filterW    width of filter.
- * @param[in]   strideD    stride in the depth.
- * @param[in]   strideH    stride in the height.
- * @param[in]   strideW    stride in the width.
- * @param[in]   paddingD   padding in the depth.
- * @param[in]   paddingH   padding in the height.
- * @param[in]   paddingW   padding in the width.
- * @param[in]   matSrc     input matrix.
- * @param[in]   beta       input
- * @param[in]   alpha      input
- *
- */
-extern void hl_matrix_col2Vol(real* dataDst,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              const real* dataSrc,
-                              real alpha,
-                              real beta);
-
-/**
- * @brief  Matrix col2Vol: Convert col matrix into 3D volume
- * @param[out]  out     output int vector.
- * @param[in]   vec     input float vector.
- * @param[in]   size    size of the vector.
- */
-extern void hl_vector_cast2int(int* out, real* vec, int size);
-
-#endif /* HL_MATRIX_H_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_apply.cuh b/paddle/legacy/cuda/include/hl_matrix_apply.cuh
deleted file mode 100644
index a067c8233b9..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix_apply.cuh
+++ /dev/null
@@ -1,423 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_APPLY_H_
-#define HL_MATRIX_APPLY_H_
-
-#include "hl_base.h"
-#include "hl_cpu_matrix_kernel.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
-
-/**
- * @brief   CPU element wise unary operator.
- *
- *  element wise op(a) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          unary op. see namespace unary
- * @param[in,out]   A_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- *
- */
-template <class T, class Op>
-extern void hl_cpu_apply_unary_op(Op op,
-                                  T* A_h,
-                                  int dimM,
-                                  int dimN,
-                                  int lda);
-
-/**
- * @brief   CPU element wise binary operator.
- *
- * element wise op(a, b) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (BAsRowVector == 0 && BAsColVector == 0)
- *   op(A[i * lda + j], B[i * ldb + j])
- *
- * if (BAsRowVector == 1 && BAsColVector == 0)
- *   op(A[i * lda + j], B[j])
- *
- * if (BAsRowVector == 0 && BAsColVector == 1)
- *   op(A[i * lda + j], B[i * ldb])
- *
- * if (BAsRowVector == 1 && BAsColVector == 1)
- *   op(A[i * lda + j], B[0])
- *
- * @param[in]       op          binary op. see namespace binary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- *
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-extern void hl_cpu_apply_binary_op(Op op,
-                                   T* A_h,
-                                   T* B_h,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb);
-
-/**
- * @brief   CPU element wise ternary operator.
- *
- * element wise op(a, b, c) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (CAsRowVector == 0 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
- *
- * if (CAsRowVector == 1 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[j])
- *
- * if (CAsRowVector == 0 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
- *
- * if (CAsRowVector == 1 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[0])
- *
- * @param[in]       op          ternary op. see namespace ternary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in,out]   C_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- *
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-extern void hl_cpu_apply_ternary_op(Op op,
-                                    T* A_h,
-                                    T* B_h,
-                                    T* C_h,
-                                    int dimM,
-                                    int dimN,
-                                    int lda,
-                                    int ldb,
-                                    int ldc);
-
-/**
- * @brief   CPU element wise quaternary operator.
- *          element wise op(a, b, c, d) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          quaternary op. see namespace ternary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in,out]   C_h         matrix.
- * @param[in,out]   D_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- * @param[in]       ldd         leading dimension of D.
- *
- */
-template <class T, class Op>
-extern void hl_cpu_apply_quaternary_op(Op op,
-                                       T* A_h,
-                                       T* B_h,
-                                       T* C_h,
-                                       T* D_h,
-                                       int dimM,
-                                       int dimN,
-                                       int lda,
-                                       int ldb,
-                                       int ldc,
-                                       int ldd);
-
-/**
- * @brief   GPU element wise unary operator.
- *          element wise op(a) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          unary op. see namespace unary.
- * @param[in,out]   A_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- *
- */
-template <class T, class Op>
-extern void hl_gpu_apply_unary_op(Op op,
-                                  T* A_d,
-                                  int dimM,
-                                  int dimN,
-                                  int lda);
-
-/**
- * @brief   GPU element wise binary operator.
- *
- * element wise op(a, b) for 0 <= i < dimM & for 0 <= j < dimN
- *
- * if (BAsRowVector == 0 && BAsColVector == 0)
- *   op(A[i * lda + j], B[i * ldb + j])
- *
- * if (BAsRowVector == 1 && BAsColVector == 0)
- *   op(A[i * lda + j], B[j])
- *
- * if (BAsRowVector == 0 && BAsColVector == 1)
- *   op(A[i * lda + j], B[i * ldb])
- *
- * if (BAsRowVector == 1 && BAsColVector == 1)
- *   op(A[i * lda + j], B[0])
- *
- * @param[in]       op          binary op. see namespace binary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- *
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-extern void hl_gpu_apply_binary_op(Op op,
-                                   T* A_d,
-                                   T* B_d,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb);
-/**
- * @brief   GPU element wise ternary operator.
- *
- * element wise op(a, b, c) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (CAsRowVector == 0 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
- *
- * if (CAsRowVector == 1 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[j])
- *
- * if (CAsRowVector == 0 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
- *
- * if (CAsRowVector == 1 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[0])
- *
- * @param[in]       op          ternary op. see namespace ternary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in,out]   C_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- *
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-extern void hl_gpu_apply_ternary_op(Op op,
-                                    T* A_d,
-                                    T* B_d,
-                                    T* C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int lda,
-                                    int ldb,
-                                    int ldc);
-
-
-/**
- * @brief   GPU element wise quaternary operator.
- *          element wise op(a, b, c, d) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          quaternary op. see namespace ternary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in,out]   C_d         matrix.
- * @param[in,out]   D_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- * @param[in]       ldd         leading dimension of D.
- *
- */
-template <class T, class Op>
-extern void hl_gpu_apply_quaternary_op(Op op,
-                                       T* A_d,
-                                       T* B_d,
-                                       T* C_d,
-                                       T* D_d,
-                                       int dimM,
-                                       int dimN,
-                                       int lda,
-                                       int ldb,
-                                       int ldc,
-                                       int ldd);
-
-/**
- * @brief  CPU matrix row operator.
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda);
-
-/**
- * @brief  CPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Saver, class Agg, class Op>
-extern void hl_cpu_matrix_row_op(Agg agg, Op op,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda,
-                                 real *B, int ldb);
-
-/**
- * @brief  CPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda);
-
-/**
- * @brief  CPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda,
-                                    real *B, int ldb);
-
-/**
- * @brief  GPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda);
-
-/**
- * @brief  GPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Saver, class Agg, class Op>
-extern void hl_gpu_matrix_row_op(Agg agg, Op op,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda,
-                                 real *B, int ldb);
-
-/**
- * @brief  GPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda);
-
-/**
- * @brief  GPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda,
-                                    real *B, int ldb);
-
-#endif /* HL_MATRIX_APPLY_H_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_base.cuh b/paddle/legacy/cuda/include/hl_matrix_base.cuh
deleted file mode 100644
index a309bb0011c..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix_base.cuh
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_BASE_CUH_
-#define HL_MATRIX_BASE_CUH_
-
-#include "hl_matrix_type.cuh"
-
-class BaseOp {
-public:
-  static const bool sse = false;
-  BaseOp() {}
-  explicit BaseOp(const real s1) {}
-  explicit BaseOp(const real s1, const real s2) {}
-  INLINE vecType vecOp(const vecType a) const {
-    return a;
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return a;
-  }
-};
-
-#ifdef __CUDA_ARCH__
-typedef BaseOp SSESum;
-typedef BaseOp SSEMax;
-typedef BaseOp SSEMin;
-typedef BaseOp SSEIdentity;
-typedef BaseOp SSEAdd;
-typedef BaseOp SSEAdd2;
-typedef BaseOp SSESub;
-typedef BaseOp SSEMul;
-typedef BaseOp SSEDiv;
-typedef BaseOp SSESquaredDiff;
-typedef BaseOp SSEFirst;
-typedef BaseOp SSESecond;
-typedef BaseOp SSEClassificationError;
-#else
-#include "hl_matrix_base_detail.cuh"
-#endif
-
-namespace aggregate {
-class sum : public SSESum {
-public:
-  INLINE real init() { return 0.0f; }
-  INLINE real operator()(const real a, const real b) const {
-    return a + b;
-  }
-};
-
-class max : public SSEMax {
-public:
-  INLINE real init() { return -HL_FLOAT_MAX; }
-  INLINE real operator()(const real a, const real b) const {
-    return a > b ? a : b;
-  }
-};
-
-class min : public SSEMin {
-public:
-  INLINE real init() {return HL_FLOAT_MAX;}
-  INLINE real operator()(const real a, const real b) const {
-    return a > b ? b : a;
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class identity : public SSEIdentity {
-public:
-  INLINE real operator()(const real a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class add : public SSEAdd {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a + b;
-  }
-};
-
-class add2 : public SSEAdd2 {
-private:
-  const real p1;
-  const real p2;
-public:
-  add2(const real s1, const real s2)
-    : SSEAdd2(s1, s2), p1(s1), p2(s2) {}
-  INLINE real operator()(const real a, const real b) const {
-    return p1 * a + p2 * b;
-  }
-};
-
-class sub : public SSESub {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a - b;
-  }
-};
-
-class mul : public SSEMul {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a * b;
-  }
-};
-
-class div : public SSEDiv {
-public:
-  INLINE real operator()(const real a, const real b) const  {
-    return a / b;
-  }
-};
-
-class squaredDiff : public SSESquaredDiff {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return (a - b) * (a - b);
-  }
-};
-
-class first : public SSEFirst {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a;
-  }
-};
-
-class second : public SSESecond {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return b;
-  }
-};
-
-class classificationError : public SSEClassificationError {
-private:
-  const real p;
-public:
-  explicit classificationError(const real s)
-    : SSEClassificationError(s), p(s) {}
-  INLINE real operator()(const real a, const real b) const {
-    return ((a > p) == (b > p)) ? 0.0f : 1.0f;
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh b/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
deleted file mode 100644
index 74211bcb929..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_BASE_DETAIL_CUH_
-#define HL_MATRIX_BASE_DETAIL_CUH_
-
-#include "hl_matrix_type.cuh"
-#include "hl_tensor_ops.h"
-
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add<vecType>()(a, b);
-  }
-};
-
-class SSEMax {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::max<vecType>()(a, b);
-  }
-};
-
-class SSEMin {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::min<vecType>()(a, b);
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add<vecType>()(a, b);
-  }
-};
-
-class SSEAdd2 {
-public:
-  static const bool sse = VECTOR_SIMD;
-  const real p1;
-  const real p2;
-  vecType mp1;
-  vecType mp2;
-
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    mp1 = hl_vec_set(p1);
-    mp2 = hl_vec_set(p2);
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
-  }
-};
-
-class SSESub {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::sub<vecType>()(a, b);
-  }
-};
-
-class SSEMul {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::mul<vecType>()(a, b);
-  }
-};
-
-class SSEDiv {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::div<vecType>()(a, b);
-  }
-};
-
-class SSESquaredDiff {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    vecType tmp = hppl::binary::sub<vecType>()(a, b);
-    return hppl::binary::mul<vecType>()(tmp, tmp);
-  }
-};
-
-class SSEFirst {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return a;
-  }
-};
-
-class SSESecond {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return b;
-  }
-};
-
-class SSEClassificationError {
-public:
-  static const bool sse = VECTOR_SIMD;
-  const real p;
-  vecType mp;
-  vecType result;
-
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    mp = hl_vec_set(p);
-    result = hl_vec_set(1.0f);
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_classification_error(a, b, mp, result);
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_ops.cuh b/paddle/legacy/cuda/include/hl_matrix_ops.cuh
deleted file mode 100644
index 4e8bd912349..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix_ops.cuh
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_OPS_CUH_
-#define HL_MATRIX_OPS_CUH_
-
-#include "hl_base.h"
-
-#ifdef __NVCC__
-#define HL_DEVICE   __device__
-#else
-#define HL_DEVICE
-#endif
-
-/**
- * @brief   parameter macro.
- */
-#define ONE_PARAMETER(name)     \
-        private: \
-          const T p;\
-        public: \
-          name(const T s) : p(s) {}
-
-#define TWO_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-        public: \
-          name(const T s1, T s2) : p1(s1), p2(s2) {}
-
-#define THREE_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-          const T p3;\
-        public: \
-          name(const T s1, T s2, T s3) : p1(s1), p2(s2), p3(s3) {}
-
-#define FOUR_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-          const T p3;\
-          const T p4;\
-        public: \
-          name(const T s1, T s2, T s3, T s4) : p1(s1), p2(s2), p3(s3), p4(s4) {}
-
-/**
- * @brief   unary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_UNARY_OP(name, op) \
-    namespace unary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a) {op;}\
-        inline void cpuOperator(T &a) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   unary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_UNARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace unary {\
-    template<class T>\
-    class name {\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a) {op;}\
-        inline void cpuOperator(T &a) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   binary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_BINARY_OP(name, op) \
-    namespace binary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b) {op;}\
-        inline void cpuOperator(T &a, T &b) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   binary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_binary_op
- * @see    hl_cpu_apply_binary_op
- */
-#define DEFINE_MATRIX_BINARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace binary {\
-    template<class T>\
-    class name {\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b) {op;}\
-        inline void cpuOperator(T &a, T &b) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   ternary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c
- *
- * @see    hl_gpu_apply_ternary_op
- * @see    hl_cpu_apply_ternary_op
- */
-#define DEFINE_MATRIX_TERNARY_OP(name, op) \
-    namespace ternary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b, T &c) {op;}\
-        inline void cpuOperator(T &a, T &b, T &c) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   ternary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c
- *
- * @see    hl_gpu_apply_ternary_op
- * @see    hl_cpu_apply_ternary_op
- */
-#define DEFINE_MATRIX_TERNARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace ternary {\
-    template<class T>\
-    class name {\
-    private:\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b, T &c) {op;}\
-        inline void cpuOperator(T &a, T &b, T &c) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   quaternary operator macro.
- *
- * @param   name        operator name.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c, d
- *
- * @see    hl_gpu_apply_quaternary_op
- * @see    hl_cpu_apply_quaternary_op
- */
-#define DEFINE_MATRIX_QUATERNARY_OP(name, op)     \
-  namespace quaternary {\
-  template<class T>\
-  class name {\
-   public:\
-   HL_DEVICE inline void gpuOperator(T &a, T &b, T &c, T &d) {op;}\
-   inline void cpuOperator(T&a, T &b, T &c, T &d) {op;}\
-  };\
-  }
-
-
-/**
- * @brief   quaternary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c, d
- *
- * @see    hl_gpu_apply_quaternary_op
- * @see    hl_cpu_apply_quaternary_op
- */
-#define DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(name, PARA_MACRO, op)     \
-  namespace quaternary {\
-  template<class T>\
-  class name {\
-   private:\
-   PARA_MACRO(name)\
-   public:\
-   HL_DEVICE inline void gpuOperator(T &a, T &b, T &c, T &d) {op;}\
-   inline void cpuOperator(T &a, T &b, T &c, T &d) {op;}\
-  };\
-  }
-
-#endif /* HL_MATRIX_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_type.cuh b/paddle/legacy/cuda/include/hl_matrix_type.cuh
deleted file mode 100644
index e61c0d0a479..00000000000
--- a/paddle/legacy/cuda/include/hl_matrix_type.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_TYPE_CUH_
-#define HL_MATRIX_TYPE_CUH_
-
-#include "hl_base.h"
-
-#ifdef __CUDA_ARCH__
-/**
- * CUDA kernel inline function
- */
-#define INLINE   __device__ inline
-#else
-/**
- * CPP inline function
- */
-#define INLINE   inline
-#endif
-
-#ifdef __CUDA_ARCH__
-#include <vector_types.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef float4 vecType;
-#else
-typedef double2 vecType;
-#endif
-#elif defined(__SSE3__)
-#include "hl_cpu_simd_sse.cuh"
-#define PADDLE_USE_SSE3
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
-// Currently nvcc does not support neon intrinsic.
-// TODO: Extract simd intrinsic implementation from .cu files.
-#include "hl_cpu_simd_neon.cuh"
-#define PADDLE_USE_NEON
-#else
-#include "hl_cpu_scalar.cuh"
-#endif
-
-#endif  // HL_MATRIX_TYPE_CUH_
diff --git a/paddle/legacy/cuda/include/hl_perturbation_util.cuh b/paddle/legacy/cuda/include/hl_perturbation_util.cuh
deleted file mode 100644
index e0a27778cae..00000000000
--- a/paddle/legacy/cuda/include/hl_perturbation_util.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef DISTRUB_UTIL_CUH_
-#define DISTRUB_UTIL_CUH_
-
-#include "hl_base.h"
-
-/*
- * Functionality: randomly rotate, scale and sample a minibatch of images
-                  and their label maps
- * images:            (numImages, imgPixels, 3)
- * targets:           (numImages, imgPixels, 3)
- *
- * created by Wei Xu. Converted to paddle by Jiang Wang.
- */
-void hl_conv_random_disturb(const real* images, int imgSize, int tgtSize,
-                            int channels, int numImages, real scaleRatio,
-                            real rotateAngle, int samplingRate,
-                            real* gpu_r_angle, real* gpu_s_ratio,
-                            int* gpu_center_r, int* gpu_center_c,
-                            int paddingValue, bool isTrain, real* targets);
-
-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
-                                        int tgtSize, int channels,
-                                        int numImages, int samplingRate,
-                                        const real* gpuRotationAngle,
-                                        const real* gpuScaleRatio,
-                                        const int* gpuCenterR,
-                                        const int* gpuCenterC,
-                                        int paddingValue, real* targets);
-
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
-                                int*& gpuCenterR, int*& gpuCenterC,
-                                int numImages, int imgSize,
-                                real rotateAngle, real scaleRatio,
-                                int samplingRate, bool isTrain);
-
-#endif /* DISTURB_UTIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_recurrent_apply.cuh b/paddle/legacy/cuda/include/hl_recurrent_apply.cuh
deleted file mode 100644
index b2cc231f58d..00000000000
--- a/paddle/legacy/cuda/include/hl_recurrent_apply.cuh
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_RECURRENT_APPLY_CUH_
-#define HL_RECURRENT_APPLY_CUH_
-
-#include "hl_base.h"
-#include "hl_activation_functions.h"
-#include "hl_lstm_ops.cuh"
-#include "hl_gpu_lstm.cuh"
-#include "hl_cpu_lstm.cuh"
-#include "hl_gru_ops.cuh"
-#include "hl_gpu_gru.cuh"
-#include "hl_cpu_gru.cuh"
-
-/**
- * @brief   Cpu lstm forward one sequence.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               hl_lstm_value type.
- * @param[in]   frameSize           frame size.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_cpu_lstm_forward(Op op,
-                                hl_lstm_value value,
-                                int frameSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate,
-                                hl_activation_mode_t active_state);
-
-/**
- * @brief   Cpu lstm backward one sequence.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[in]   value               lstm value.
- * @param[out]  grad                output gradient.
- * @param[in]   frameSize           frame size.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_cpu_lstm_backward(Op op,
-                                 hl_lstm_value value,
-                                 hl_lstm_grad grad,
-                                 int frameSize,
-                                 hl_activation_mode_t active_node,
-                                 hl_activation_mode_t active_gate,
-                                 hl_activation_mode_t active_state);
-
-/**
- * @brief   Gpu lstm batch forward.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               lstm value.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           size of current batch.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_gpu_lstm_forward(Op op,
-                                hl_lstm_value value,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate,
-                                hl_activation_mode_t active_state);
-
-/**
- * @brief   Gpu lstm batch backward.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               lstm value.
- * @param[out]  grad                lstm gradient.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           size of current batch.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_gpu_lstm_backward(Op op,
-                                 hl_lstm_value value,
-                                 hl_lstm_grad grad,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_node,
-                                 hl_activation_mode_t active_gate,
-                                 hl_activation_mode_t active_state);
-
-/**
- * @brief   Cpu gru forward.
- *
- * @param[in]     opResetOutput   hl_gru_ops.cuh
- * @param[in]     opFinalOutput   hl_gru_ops.cuh
- * @param[in,out] value           gru value.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpResetOutput, class OpFinalOutput>
-extern void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                               OpFinalOutput opFinalOutput,
-                               hl_gru_value value,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate);
-
-/**
- * @brief   Cpu gru forward.
- *
- * @param[in]     opStateGrad     hl_gru_ops.cuh
- * @param[in]     opResetGrad     hl_gru_ops.cuh
- * @param[in]     value           gru value.
- * @param[in,out] grad            gru gradient.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpStateGrad, class OpResetGrad>
-extern void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                                OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate);
-
-/**
- * @brief   Gpu gru forward.
- *
- * @param[in]     opResetOutput   hl_gru_ops.cuh
- * @param[in]     opFinalOutput   hl_gru_ops.cuh
- * @param[in,out] value           gru value.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpResetOutput, class OpFinalOutput>
-extern void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                               OpFinalOutput opFinalOutput,
-                               hl_gru_value value,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate);
-
-/**
- * @brief   Gpu gru forward.
- *
- * @param[in]     opStateGrad     hl_gru_ops.cuh
- * @param[in]     opResetGrad     hl_gru_ops.cuh
- * @param[in]     value           gru value.
- * @param[in,out] grad            gru gradient.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpStateGrad, class OpResetGrad>
-extern void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                                OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate);
-
-#endif /* HL_RECURRENT_APPLY_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_sequence.h b/paddle/legacy/cuda/include/hl_sequence.h
deleted file mode 100644
index 3923bdd921b..00000000000
--- a/paddle/legacy/cuda/include/hl_sequence.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SEQUENCE_H_
-#define HL_SEQUENCE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Maximum sequence forward.
- *
- * @param[in]   input           each sequence contains some instances.
- * @param[in]   sequence        sequence index..
- * @param[out]  output          max instance in this sequence.
- * @param[out]  index           index of max instance.
- * @param[in]   numSequences    size of sequence[in].
- * @param[in]   dim             input dimension.
- *
- */
-extern void hl_max_sequence_forward(real* input,
-                                    const int* sequence,
-                                    real* output,
-                                    int* index,
-                                    int numSequences,
-                                    int dim);
-
-/**
- * @brief   Maximum sequence backward.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   index           index of max instance.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    size of sequence[in].
- * @param[in]   dim             input dimension.
- *
- */
-extern void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
-
-/**
- * @brief   Memory copy from sequence to batch.
- *
- * if seq2batch == true
- *
- *    copy from sequence to batch: batch[i] = sequence[batchIndex[i]].
- *
- * if seq2batch == false
- *
- *    copy from batch to sequence: sequence[batchIndex[i]] = batch[i].
- *
- * @param[in,out]   batch       batch matrix.
- * @param[in,out]   sequence    equence matrix.
- * @param[in]       batchIndex  index vector.
- * @param[in]       seqWidth    width of sequence.
- * @param[in]       batchCount  number of batchIndex.
- * @param[in]       seq2batch   copy direction.
- *
- */
-extern void hl_sequence2batch_copy(real* batch,
-                                   real* sequence,
-                                   const int* batchIndex,
-                                   int seqWidth,
-                                   int batchCount,
-                                   bool seq2batch);
-
-/**
- * @brief   Add sequence to batch.
- *
- * if seq2batch == true
- *
- *    add sequence to batch: batch[i] = sequence[batchIndex[i]].
- *
- * if seq2batch == false
- *
- *    add batch to sequence: sequence[batchIndex[i]] = batch[i].
- *
- * @param[in,out]   batch       batch matrix.
- * @param[in,out]   sequence    equence matrix.
- * @param[in]       batchIndex  index vector.
- * @param[in]       seqWidth    width of sequence.
- * @param[in]       batchCount  number of batchIndex.
- * @param[in]       seq2batch   copy direction.
- *
- */
-extern void hl_sequence2batch_add(real* batch,
-                                  real* sequence,
-                                  int* batchIndex,
-                                  int seqWidth,
-                                  int batchCount,
-                                  bool seq2batch);
-
-/**
- * @brief   Memory copy from sequence to batch,
- *          while padding all sequences to the same length.
- *
- * if seq2batch == true
- *
- *    copy from sequence to batch:
- *        batch[i] = sequence[sequenceStartPositions[i]]
- *
- * if seq2batch == false
- *
- *    copy from batch to sequence:
- *        sequence[sequenceStartPositions[i]] = batch[i]
- *
- * @param[in,out]   batch                   batch matrix.
- * @param[in,out]   sequence                sequence matrix.
- * @param[in]       sequenceStartPositions  index vector.
- * @param[in]       sequenceWidth           width of sequence.
- * @param[in]       maxSequenceLength       maximum length of sequences.
- * @param[in]       numSequences            number of sequences.
- * @param[in]       normByTimes             whether dividing sequence's length.
- * @param[in]       seq2batch               copy direction.
- *
- */
-extern void hl_sequence2batch_copy_padding(real* batch,
-                                           real* sequence,
-                                           const int* sequenceStartPositions,
-                                           const size_t sequenceWidth,
-                                           const size_t maxSequenceLength,
-                                           const size_t numSequences,
-                                           bool normByTimes,
-                                           bool seq2batch);
-
-/**
- * @brief  dst = Op(src), src is sequence.
- *
- * mode = 0, Op is average.
- *
- * mode = 1, Op is sum.
- *
- * mode = 2, Op is sum(src)/sqrt(N), N is sequence length.
- *
- * @param[in,out]   dst       destination data.
- * @param[in]       src       source data.
- * @param[in]       starts    sequence start positions.
- * @param[in]       height    height of dst data.
- * @param[in]       width     width of dst data.
- * @param[in]       mode      0: avreage,
- *                            1: sum,
- *                            2: divide by square root
- *                            of sequenceLength
- */
-extern void hl_sequence_avg_forward(real* dst,
-                                    real* src,
-                                    const int* starts,
-                                    int height,
-                                    int width,
-                                    const int mode);
-
-extern void hl_sequence_avg_backward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode);
-#endif /* HL_SEQUENCE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_sparse.h b/paddle/legacy/cuda/include/hl_sparse.h
deleted file mode 100644
index 9aab52e045c..00000000000
--- a/paddle/legacy/cuda/include/hl_sparse.h
+++ /dev/null
@@ -1,523 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SPARSE_H_
-#define HL_SPARSE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Malloc a sparse matrix.
- *
- * @param[out]  A_d        sparse matrix.
- * @param[in]   format     format.
- * @param[in]   value_type valueType.
- * @param[in]   dimM       height.
- * @param[in]   dimN       width.
- * @param[in]   nnz        number of none zero element.
- *
- */
-extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                    hl_matrix_format_t format,
-                                    hl_matrix_value_t value_type,
-                                    int dimM,
-                                    int dimN,
-                                    int nnz);
-
-/**
- * @brief   Free a sparse matrix.
- *
- * @param[in]  A_d  GPU sparse matrix.
- *
- */
-extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
-
-/**
- * @brief   Construct a sparse matrix use input gpu memory.
- *
- * @param[out]  A_d         sparse matrix.
- * @param[in]   dest_d      gpu memory.
- * @param[in]   size        size of dest_d.
- * @param[in]   format      format.
- * @param[in]   value_type  valueType.
- * @param[in]   dimM        height.
- * @param[in]   dimN        width.
- * @param[in]   nnz         number of none zero element.
- *
- * @note    Destruct api is hl_destruct_sparse_matrix.
- *
- */
-extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void *dest_d,
-                                       size_t size,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz);
-
-/**
- * @brief   Use three arrays to construct sparse matrix.
- *
- * if format is HL_SPARSE_CSR, size of rows_d is dimM + 1,
- * and size of cols_d is nnz;
- *
- * if format is HL_SPARSE_CSC, size of rows_d is nnz, and size of
- * cols_d is dimN + 1.
- *
- * if valueType is HL_NO_VALUE, size of value_d is zero,
- * else size of value_d is nnz.
- *
- * @param[out]  A_d        sparse matrix.
- * @param[in]   value_d    value.
- * @param[in]   rows_d     row.
- * @param[in]   cols_d     col.
- * @param[in]   format     format.
- * @param[in]   value_type valueType.
- * @param[in]   dimM       height.
- * @param[in]   dimN       width.
- * @param[in]   nnz        number of none zero element.
- *
- * @note    The corresponding destructor interface is hl_destruct_sparse_matrix.
- *
- */
-extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real *value_d,
-                                       int *rows_d,
-                                       int *cols_d,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz);
-
-/**
- * @brief   Destruct sparse matrix.
- *
- * @param[in] A_d  sparse matrix.
- *
- */
-extern void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d);
-
-/**
- * @brief   Copy value & index to sparse matrix.
- *
- * if csr_matrix is HL_FLOAT_VALUE.
- *
- *  1. csr_val, csr_row, csr_col three pointers are not null.
- *
- *  2. csr_val is not null, csr_row adn csr_col are null.
- *
- * if csr_matrix is HL_NO_VALUE.
- *
- *  1. csr_val will be ignore, csr_row and csr_col are not null.
- *
- *
- * @param[in,out]   csr_matrix sparse matrix.
- * @param[in]       csr_val    point to csr value array(nnz).
- * @param[in]       csr_row    point to csr row indices array(dimM+1).
- * @param[in]       csr_col    point to csr col indices array(nnz).
- * @param[in]       stream     hl_stream_t type.
- *
- */
-extern void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                                 real *csr_val,
-                                 int *csr_row,
-                                 int *csr_col,
-                                 hl_stream_t stream);
-
-/**
- * @brief   Copy value & index to sparse matrix.
- *
- * if csr_matrix is HL_FLOAT_VALUE.
- *
- *   1. csc_val, csc_row, csc_col three pointers are not null.
- *
- *   2. csc_val is not null, csc_row and csc_col are null.
- *
- * if csr_matrix is HL_NO_VALUE.
- *
- *   1. csc_val will be ignore, csc_row and csc_col are not null.
- *
- * @param[in,out]   csc_matrix sparse matrix.
- * @param[in]       csc_val    point to csc value array(nnz).
- * @param[in]       csc_row    point to csc row indices array(nnz).
- * @param[in]       csc_col    point to csc col indices array(dimN+1).
- * @param[in]       stream     hl_stream_t type.
- *
- *
- */
-extern void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                                 real *csc_val,
-                                 int *csc_row,
-                                 int *csc_col,
-                                 hl_stream_t stream);
-
-/**
- * @brief   Copy sparse matrix to sparse matrix.
- *
- * @param[out]  dst     sparse matrix.
- * @param[in]   src     sparse matrix.
- * @param[in]   stream  hl_stream_t type.
- *
- *
- * @note    1. Format of the src matrix and dst matrix needs to be consistent.
- *          2. Source matrix has value, the destination matrix has value or
- *             no value can be; the source matrix is no value, then the
- *             destination matrix must also be no value;
- */
-extern void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                                    hl_sparse_matrix_s src,
-                                    hl_stream_t stream);
-
-/**
- * @brief   csr matrix to dense matrix.
- *
- * @param[in]   A_d     csr matrix.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    height.
- * @param[in]   dimN    width.
- *
- */
-extern void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN);
-
-/**
- * @brief   csc matrix to dense matrix.
- *
- * @param[in]   A_d     csc matrix.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    height.
- * @param[in]   dimN    width.
- *
- */
-extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     csr sparse matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     dense matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     sparse matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     dense matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     dense matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     csc sparse matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transa is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_dense_mul_csc(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *          Calculated based on the non-zero elements of the matrix C.
- *
- * @param[in]     A_d     dense matrix.
- * @param[in]     transa  operation op(A) that is non-or transpose.
- * @param[in]     B_d     dense matrix.
- * @param[in]     transb  operation op(B) that is non-or transpose.
- * @param[in,out] C_d     sparse matrix.
- * @param[in]     dimM    matrix height of op(A) & C
- * @param[in]     dimN    matrix width of op(B) & C
- * @param[in]     dimK    width of op(A) & height of op(B)
- * @param[in]     alpha   scalar used for multiplication.
- * @param[in]     beta    scalar used for multiplication.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_sparse_matrix_mul(real *A_d,
-                                 hl_trans_op_t transa,
-                                 real *B_d,
-                                 hl_trans_op_t transb,
-                                 hl_sparse_matrix_s C_d,
-                                 int dimM,
-                                 int dimN,
-                                 int dimK,
-                                 real alpha,
-                                 real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     dense matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     sparse matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- *
- * @note    transa is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_dense_mul_csr(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   Memcpy csc_matrix to host.
- *
- * a. according to csc_matrix, update three arrays
- *
- *  1. csc_val, csc_row, csc_col are dest Address.
- *
- *  2. if type of csc_matrix is HL_NO_VALUE, update csc_row and csc_col
- *
- *  3. if type of csc_matrix is HL_FLOAT_VALUE, update csc_row,
- *     csc_col and csc_value.
- *
- * b. The interface is asynchronous copy. To ensure that the data is copied
- *     please call the synchronous interface;
- *
- *
- * @param[out]  csc_val     point to csc value array(nnz).
- * @param[in]   val_size    csc value size.
- * @param[out]  csc_row     point to csc row indices array(nnz).
- * @param[in]   row_size    csc row size.
- * @param[out]  csc_col     point to csc col indices array(dimN + 1).
- * @param[in]   col_size    csc column size.
- * @param[in]   csc_matrix  sparse matrix.
- * @param[in]   stream      hl_stream_t type.
- *
- */
-extern void hl_memcpy_from_csc_matrix(real *csc_val,
-                                      size_t val_size,
-                                      int *csc_row,
-                                      size_t row_size,
-                                      int *csc_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csc_matrix,
-                                      hl_stream_t stream);
-
-/**
- * @brief   Memcpy sparse matrix to host.
- *
- * a. according to csr_matrix, update three arrays
- *
- *  1. csr_val, csr_row, csr_col are dest Address.
- *
- *  2. if type of csr_matrix is HL_NO_VALUE, update csr_row and csr_col
- *
- *  3. if type of csr_matrix is HL_FLOAT_VALUE, update csr_row,
- *     csr_col and csr_value
- *
- * b. The interface is asynchronous copy. To ensure that the data is copied
- *     please call the synchronous interface;
- *
- * @param[out]  csr_val     point to csr value array(nnz).
- * @param[in]   val_size    csr value size.
- * @param[out]  csr_row     point to csr row indices array(nnz).
- * @param[in]   row_size    csr row size.
- * @param[out]  csr_col     point to csr col indices array(dimN + 1).
- * @param[in]   col_size    csr column size.
- * @param[in]   csr_matrix  sparse matrix.
- * @param[in]   stream      hl_stream_t type.
- *
- */
-extern void hl_memcpy_from_csr_matrix(real *csr_val,
-                                      size_t val_size,
-                                      int *csr_row,
-                                      size_t row_size,
-                                      int *csr_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csr_matrix,
-                                      hl_stream_t stream);
-
-/**
- * @brief   A_d[j] += B_d[i,j] for i in range(height)
- *
- * @param[in,out]   A_d    vector, size = width.
- * @param[in]       B_d    sparse matrix.
- * @param[in]       dimM   height.
- * @param[in]       dimN   width.
- * @param[in]       scale  scale of B_d
- *
- */
-extern void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matirx_column_sum
- */
-extern void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
-
-/**
- * @brief   A_d[i,j] += B_d[j]
- *
- * @param[in,out]   A_d    sprare matrix.
- * @param[in]       B_d    vector, size = A_d.width.
- * @param[in]       scale  scale of B_d.
- *
- */
-extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real *B_d,
-                                      real scale);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
- */
-extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real *B_d,
-                                   real scale);
-
-/**
- * @brief   sparseMatrix = alpha * denseMatrix + beta *sparseMatrix
- *          A_d[i,j] = alpha * B_d[i,j] + beta * A_d[i,j]
- *          Only add value of same (row, col) index in dense matrix and
- *          do not use others values whoes postions are not in sparse matirx.
- *
- * @param[in,out]   A_d    sprare matrix.
- * @param[in]       B_d    dense matrix.
- * @param[in]       dimM   height of B_d.
- * @param[in]       dimN   width of B_d.
- * @param[in]       alpha  scale of B_d.
- * @param[in]       beta   scale of A_d.
- *
- */
-extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real *B_d,
-                                       int dimM,
-                                       int dimN,
-                                       real alpha,
-                                       real beta);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
- */
-extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real *B_d,
-                                    int dimM,
-                                    int dimN,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief get rows pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return rows pointer, which is gpu address
- *
- */
-extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
-
-/**
- * @brief get cols pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return cols pointer, which is gpu address
- *
- */
-extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
-
-/**
- * @brief get value pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return value pointer, which is gpu address
- *
- */
-extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
-#endif /* HL_SPARSE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_sparse.ph b/paddle/legacy/cuda/include/hl_sparse.ph
deleted file mode 100644
index c0fdccb942c..00000000000
--- a/paddle/legacy/cuda/include/hl_sparse.ph
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_SPARSE_PH_
-#define HL_SPARSE_PH_
-
-#include "hl_base.h"
-
-/**
- * @brief   sparse matrix csr format.
- *
- * @param   *csr_val     nonzero values of matrix.
- * @param   *csr_row     row indices.
- * @param   *csr_col     column indices.
- * @param   nnz_s        sizeof of csr_val & csr_col.
- * @param   row_s        sizeof of csr_row.
- * @param   sparsity     sparsity pattern.
- *
- */
-typedef struct {
-    real                *csr_val;
-    int                 *csr_row;
-    int                 *csr_col;
-    size_t              nnz_s;
-    int                 row_s;
-    float               sparsity;
-}_hl_csr_matrix, *hl_csr_matrix;
-
-/**
- * @brief   sparse matrix csc format.
- *
- * @param   *csc_val      nonzero values of matrix.
- * @param   *csc_row      row indices.
- * @param   *csc_col      column indices.
- * @param   nnz_s         sizeof of csc_val & csc_row.
- * @param   col_s         sizeof of csc_col.
- * @param   sparsity      sparsity pattern.
- *
- */
-typedef struct {
-    real                *csc_val;
-    int                 *csc_row;
-    int                 *csc_col;
-    size_t              nnz_s;
-    int                 col_s;
-    float               sparsity;
-}_hl_csc_matrix, *hl_csc_matrix;
-
-#define __sparse_get_type_return__(mat, type, field)\
-  do {\
-    hl_##type##_matrix type##_d = (hl_##type##_matrix)((mat)->matrix);\
-    if (type##_d) {\
-      return type##_d -> type##_##field;\
-    } else {\
-      LOG(WARNING) << "parameter " <<  #field << "NULL error!";\
-      return NULL;\
-    }\
-  } while(0)
-
-#define __sparse_get_return__(mat, field)\
-  do {\
-    if ((mat) == NULL) {\
-      LOG(WARNING) << "parameter NULL error!";\
-      return NULL;\
-    }\
-    if ((mat)->format == HL_SPARSE_CSR) {\
-      __sparse_get_type_return__(mat, csr, field);\
-    } else {\
-      __sparse_get_type_return__(mat, csc, field);\
-    }\
-  } while(0)
-
-#endif  /* HL_SPARSE_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_table_apply.h b/paddle/legacy/cuda/include/hl_table_apply.h
deleted file mode 100644
index dff60aa0a22..00000000000
--- a/paddle/legacy/cuda/include/hl_table_apply.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TABLE_APPLY_H_
-#define HL_TABLE_APPLY_H_
-
-/**
- * @brief   Get row from table.
- *          output[i] += table[ids[i]]
- *          if ids[i] == -1, it will be ignored
- *
- * @param[out]  output          output matrix.
- * @param[in]   ldo             leading dimension of output.
- * @param[in]   table           table matrix.
- * @param[in]   ldt             leading dimension of table.
- * @param[in]   ids             ids vector.
- * @param[in]   numSamples      height of output.
- * @param[in]   tableSize       height of table.
- * @param[in]   dim             width of table.
- *
- */
-extern void hl_matrix_select_rows(real* output,
-                                  int ldo,
-                                  real* table,
-                                  int ldt,
-                                  int* ids,
-                                  int numSamples,
-                                  int tableSize,
-                                  int dim);
-
-/**
- * @brief   Add row to table.
- *          table[ids[i]] += output[i]
- *          if ids[i] == -1, it will be ignored
- *
- * @param[out]  table           table matrix.
- * @param[in]   ldt             leading dimension of table.
- * @param[in]   input           input matrix.
- * @param[in]   ldi             leading dimension of input.
- * @param[in]   ids             ids vector.
- * @param[in]   numSamples      height of input.
- * @param[in]   tableSize       height of table.
- * @param[in]   dim             width of table.
- *
- */
-extern void hl_matrix_add_to_rows(real* table,
-                                  int ldt,
-                                  real* input,
-                                  int ldi,
-                                  int* ids,
-                                  int numSamples,
-                                  int tableSize,
-                                  int dim);
-
-/**
- * @brief   Select element from vector.
- *
- * @param[out]  dst         output vector.
- * @param[in]   sized       size of dst.
- * @param[in]   src         input vector.
- * @param[in]   sizes       size of src.
- * @param[in]   ids         index vector.
- * @param[in]   sizei       size of ids.
- *
- */
-template <class T>
-extern void hl_vector_select_from(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-
-#endif /* HL_TABLE_APPLY_H_ */
diff --git a/paddle/legacy/cuda/include/hl_tensor_ops.h b/paddle/legacy/cuda/include/hl_tensor_ops.h
deleted file mode 100644
index bc5e5da53d5..00000000000
--- a/paddle/legacy/cuda/include/hl_tensor_ops.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TENSOR_OPS_H_
-#define HL_TENSOR_OPS_H_
-
-#include <cmath>
-#include "hl_matrix_type.cuh"
-
-namespace hppl {
-namespace unary {
-
-template <class T>
-class add_scale {
- private:
-  const T p;
-
- public:
-  INLINE add_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a + p; }
-};
-
-template <class T>
-class sub_scale {
- private:
-  const T p;
-
- public:
-  INLINE sub_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a - p; }
-};
-
-template <class T>
-class mul_scale {
- private:
-  const T p;
-
- public:
-  INLINE mul_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a * p; }
-};
-
-template <class T>
-class div_scale {
- private:
-  const T p;
-
- public:
-  INLINE div_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a / p; }
-};
-
-template <class T>
-class neg {
- public:
-  INLINE T operator()(const T a) const { return -a; }
-};
-
-template <class T>
-class exp_op {
- public:
-  INLINE T operator()(const T a) const { return std::exp(a); }
-};
-
-template <class T>
-class log_op {
- public:
-  INLINE T operator()(const T a) const { return std::log(a); }
-};
-
-template <class T>
-class sqrt_op {
- public:
-  INLINE T operator()(const T a) const { return std::sqrt(a); }
-};
-
-template <class T>
-class square {
- public:
-  INLINE T operator()(const T a) const { return a * a; }
-};
-
-template <class T>
-class reciprocal {
- public:
-  INLINE T operator()(const T a) const { return T(1) / a; }
-};
-
-template <class T>
-class abs {
- public:
-  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
-};
-
-template <class T>
-class sign {
- public:
-  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
-};
-
-template <class T>
-class min {
- private:
-  const T p;
-
- public:
-  INLINE min(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a > p ? p : a; }
-};
-
-template <class T>
-class max {
- private:
-  const T p;
-
- public:
-  INLINE max(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a < p ? p : a; }
-};
-
-template <class T>
-class pow_op {
- private:
-  const T p;
-
- public:
-  INLINE pow_op(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return std::pow(a, p); }
-};
-
-template <class T>
-class constant {
- private:
-  const T p;
-
- public:
-  INLINE constant(const T s) : p(s) {}
-  INLINE T operator()(int i) const { return p; }
-  INLINE T operator()(int i, int j) const { return p; }
-};
-
-template <class T>
-class cmp_eq {
- private:
-  const T p;
-
- public:
-  INLINE cmp_eq(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a == p; }
-};
-
-template <class T>
-class cmp_ne {
- private:
-  const T p;
-
- public:
-  INLINE cmp_ne(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a != p; }
-};
-
-template <class T>
-class cmp_le {
- private:
-  const T p;
-
- public:
-  INLINE cmp_le(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a <= p; }
-};
-
-template <class T>
-class cmp_lt {
- private:
-  const T p;
-
- public:
-  INLINE cmp_lt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a < p; }
-};
-
-template <class T>
-class cmp_ge {
- private:
-  const T p;
-
- public:
-  INLINE cmp_ge(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a >= p; }
-};
-
-template <class T>
-class cmp_gt {
- private:
-  const T p;
-
- public:
-  INLINE cmp_gt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a > p; }
-};
-
-template <class T>
-class and_op {
- private:
-  const T p;
-
- public:
-  INLINE and_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a && p; }
-};
-
-template <class T>
-class or_op {
- private:
-  const T p;
-
- public:
-  INLINE or_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a || p; }
-};
-
-}  // namespace unary
-
-namespace binary {
-template <class T>
-class add {
- public:
-  INLINE T operator()(const T a, const T b) const { return a + b; }
-};
-
-template <class T>
-class add_scale {
- private:
-  const T p1;
-  const T p2;
-
- public:
-  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
-  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
-};
-
-template <class T>
-class sub {
- public:
-  INLINE T operator()(const T a, const T b) const { return a - b; }
-};
-
-template <class T>
-class mul {
- public:
-  INLINE T operator()(const T a, const T b) const { return a * b; }
-};
-
-template <class T>
-class div {
- public:
-  INLINE T operator()(const T a, const T b) const { return a / b; }
-};
-
-template <class T>
-class cmp_eq {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a == b; }
-};
-
-template <class T>
-class cmp_ne {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a != b; }
-};
-
-template <class T>
-class cmp_le {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a <= b; }
-};
-
-template <class T>
-class cmp_lt {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a < b; }
-};
-
-template <class T>
-class cmp_ge {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a >= b; }
-};
-
-template <class T>
-class cmp_gt {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a > b; }
-};
-
-template <class T>
-class and_op {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a && b; }
-};
-
-template <class T>
-class or_op {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a || b; }
-};
-
-template <class T>
-class min {
- public:
-  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
-};
-
-template <class T>
-class max {
- public:
-  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
-};
-
-#ifdef PADDLE_USE_SSE3
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128> {
- private:
-  const __m128 p1;
-  const __m128 p2;
-
- public:
-  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
-  }
-};
-
-template <>
-class sub<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-};
-
-template <>
-class mul<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-};
-
-template <>
-class div<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-};
-
-template <>
-class min<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-};
-
-template <>
-class max<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-};
-#else
-template <>
-class add<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128d> {
- private:
-  const __m128d p1;
-  const __m128d p2;
-
- public:
-  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
-  }
-};
-
-template <>
-class sub<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-
-template <>
-class mul<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-
-template <>
-class div<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-
-template <>
-class min<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-
-template <>
-class max<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_SSE3
-
-#ifdef PADDLE_USE_NEON
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(a, b);
-  }
-};
-
-template <>
-class add_scale<float32x4_t> {
- private:
-  const float32x4_t p1;
-  const float32x4_t p2;
-
- public:
-  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
-      : p1(s1), p2(s2) {}
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
-  }
-};
-
-template <>
-class sub<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vsubq_f32(a, b);
-  }
-};
-
-template <>
-class mul<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmulq_f32(a, b);
-  }
-};
-
-template <>
-class div<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    float32x4_t tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
-  }
-};
-
-template <>
-class min<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vminq_f32(a, b);
-  }
-};
-
-template <>
-class max<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmaxq_f32(a, b);
-  }
-};
-#else
-#error To be implemented
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_NEON
-
-}  // namespace binary
-}  // namespace hppl
-
-#endif  // HL_TENSOR_OPS_H_
diff --git a/paddle/legacy/cuda/include/hl_thread.ph b/paddle/legacy/cuda/include/hl_thread.ph
deleted file mode 100644
index 4abede1517a..00000000000
--- a/paddle/legacy/cuda/include/hl_thread.ph
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_THREAD_PH_
-#define HL_THREAD_PH_
-
-#include <stdio.h>
-#include <pthread.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
-#include "hl_base.h"
-
-/**
- * @brief   Thread resource structure.
- *
- * @param   stream[HPPL_STREAM_END] Stream for thread.
- * @param   handle                  Cublas Handle.
- * @param   gen                     Curand Generator.
- * @param   cudnn_handle            Cudnn handle.
- * @param   cudnn_desc              Cudnn image descriptor.
- * @param   *gen_mutex              Gen lock.
- * @param   *gpu_mem                HPPL GPU Memory.
- * @param   *cpu_mem                HPPL CPU Memory.
- * @param   event                   gpu_mem event.
- * @param   device                  Thread device context.
- * @param   major                   Compute capability.
- * @param   is_init                 Thread init or not.
- */
-typedef struct {
-    cudaStream_t             stream[HPPL_STREAM_END];
-    cublasHandle_t           handle;
-    curandGenerator_t        gen;
-    cudnnHandle_t            cudnn_handle;
-    cudnnTensorDescriptor_t  cudnn_desc;
-    pthread_mutex_t          *gen_mutex;
-    real                     *gpu_mem;
-    real                     *cpu_mem;
-    cudaEvent_t              event;
-    int                      device;
-    int                      major;
-    bool                     is_init;
-} _hl_thread_resource, *hl_thread_resource;
-
-extern __thread _hl_thread_resource t_resource;
-
-/**
- * @brief   Initialize cudnn.
- *
- * @param   cudnn_handle  Cudnn handle.
- * @param   stream        Cudnn stream.
- */
-extern void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream);
-
-/**
- * @brief   Initialize cublas.
- *
- * @param   cublas_handle  Cublas handle.
- * @param   stream         Cuda stream.
- */
-extern void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream);
-
-/**
- * @brief   Initialize cudnn tensor descriptor.
- *
- * @param   cudnn_desc    Cudnn tensor descriptor.
- */
-
-extern void hl_cudnn_desc_init(cudnnTensorDescriptor_t*  cudnn_desc);
-
-#endif  /* HL_THREAD_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_time.h b/paddle/legacy/cuda/include/hl_time.h
deleted file mode 100644
index 61d80c065c8..00000000000
--- a/paddle/legacy/cuda/include/hl_time.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TIME_H_
-#define HL_TIME_H_
-#include <cstdint>
-/**
- * @brief   High resolution timer.
- *
- * @return  int64_t the representation value of the object as a
- *                  count of periods, which are not necessarily
- *                  seconds.
- *
- * @note    It is used to generate random perturbation parameters.
- */
-int64_t getCurrentTimeStick(void);
-
-#endif /* HL_TIME_H_ */
diff --git a/paddle/legacy/cuda/include/hl_top_k.h b/paddle/legacy/cuda/include/hl_top_k.h
deleted file mode 100644
index a3c7872f525..00000000000
--- a/paddle/legacy/cuda/include/hl_top_k.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TOP_K_H_
-#define HL_TOP_K_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   find top k element.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            input value.
- * @param[in]   lds            leading dimension of src.
- * @param[in]   dim            width of input value.
- * @param[in]   beamSize       beam size.
- * @param[in]   numSamples     height of input value.
- *
- */
-extern void hl_matrix_top_k(real* topVal,
-                            int ldv,
-                            int* topIds,
-                            real* src,
-                            int lds,
-                            int dim,
-                            int beamSize,
-                            int numSamples);
-
-/**
- * @brief   find top k element for each row in sparse matrix.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            sparse matrix.
- * @param[in]   beamSize       beam size.
- * @param[in]   numSamples     height of input value.
- *
- * @note    Only support HL_SPARSE_CSR format.
- */
-extern void hl_sparse_matrix_top_k(real* topVal,
-                                   int ldv,
-                                   int* topIds,
-                                   hl_sparse_matrix_s src,
-                                   int beamSize,
-                                   int numSamples);
-
-/**
- * @brief   Matrix classification error.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            input value.
- * @param[in]   lds            leading dimension of src.
- * @param[in]   dim            width of input value.
- * @param[in]   topkSize       size of top k element.
- * @param[in]   numSamples     height of input value.
- * @param[in]   label          ground truth label.
- * @param[out]  recResult      top-k classification error.
- *
- */
-extern void hl_matrix_classification_error(real* topVal,
-                                           int ldv,
-                                           int* topIds,
-                                           real* src,
-                                           int lds,
-                                           int dim,
-                                           int topkSize,
-                                           int numSamples,
-                                           int* label,
-                                           real* recResult);
-
-#endif  // HL_TOP_K_H_
diff --git a/paddle/legacy/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
deleted file mode 100644
index 09cbd6d450f..00000000000
--- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#ifndef HL_WARPCTC_WRAP_H_
-#define HL_WARPCTC_WRAP_H_
-#include "ctc.h"
-#include "hl_base.h"
-
-typedef ctcStatus_t hl_warpctc_status_t;
-typedef ctcOptions hl_warpctc_options_t;
-
-/**
- * @brief Init ctc options.
- *
- * @param[in]   blank     blank label used in ctc loss function.
- * @param[in]   useGpu    whether use gpu.
- * @param[out]  options   handle to store cpu or gpu informations.
- *
- */
-extern void hl_warpctc_init(const size_t blank,
-                            bool useGpu,
-                            hl_warpctc_options_t* options);
-
-/**
- * @brief Compute the connectionist temporal classification loss,
- *        and optionally compute the gradient with respect to the inputs.
- *
- * if batchGrad == nullptr
- *
- *    only compute the ctc loss.
- *
- * if batchGrad != nullptr
- *
- *    compute both ctc loss and gradient.
- *
- * @param[in]   batchInput      batch matrix of input probabilities,
- *                              in maxSequenceLength x numSequence x numClasses
- *                              (row-major) format.
- * @param[out]  batchGrad       batch matrix of gradient.
- * @param[in]   cpuLabels       labels always in CPU memory.
- * @param[in]   cpuLabelLengths length of all labels in CPU memory.
- * @param[in]   cpuInputLengths length of all sequences in CPU memory.
- * @param[in]   numClasses      number of possible output symbols.
- * @param[in]   numSequences    number of sequence.
- * @param[out]  cpuCosts        cost of each sequence in CPU memory.
- * @param[out]  workspace       workspace to store some temporary results.
- * @param[in]   options         handle to store cpu or gpu informations.
- *
- */
-extern void hl_warpctc_compute_loss(const real* batchInput,
-                                    real* batchGrad,
-                                    const int* cpuLabels,
-                                    const int* cpuLabelLengths,
-                                    const int* cpuInputLengths,
-                                    const size_t numClasses,
-                                    const size_t numSequences,
-                                    real* cpuCosts,
-                                    void* workspace,
-                                    hl_warpctc_options_t* options);
-
-/**
- * @brief Compute the required workspace size.
- *        There is no memory allocated operations within warp-ctc.
- *
- * @param[in]   cpuLabelLengths length of all labels in CPU memory.
- * @param[in]   cpuInputLengths length of all sequences in CPU memory.
- * @param[in]   numClasses      number of possible output symbols.
- * @param[in]   numSequences    number of sequence.
- * @param[in]   options         handle to store cpu or gpu informations.
- * @param[out]  bytes           pointer to a scalar where the memory
- *                              requirement in bytes will be placed.
- *
- */
-extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
-                                          const int* cpuInputLengths,
-                                          const size_t numClasses,
-                                          const size_t numSequences,
-                                          hl_warpctc_options_t* options,
-                                          size_t* bytes);
-
-#endif  // HL_WARPCTC_WRAP_H_
-#endif
diff --git a/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h b/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
deleted file mode 100644
index 2ac841facc6..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AGGREGATE_STUB_H_
-#define HL_AGGREGATE_STUB_H_
-
-#include "hl_aggregate.h"
-
-inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
-
-inline void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {}
-
-#endif  // HL_AGGREGATE_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
deleted file mode 100644
index 997eed62e07..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_STUB_H_
-#define HL_CNN_STUB_H_
-
-#include "hl_cnn.h"
-
-inline void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* MaskData) {}
-
-inline void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride) {}
-
-inline void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               const bool excludeMode) {}
-
-inline void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                const bool excludeMode) {}
-
-inline void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride) {}
-
-inline void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride) {}
-
-inline void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride) {}
-
-inline void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride) {}
-
-inline void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {}
-
-inline void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {}
-
-inline void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t group) {}
-
-inline void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t group) {}
-
-inline void hl_upsample_forward(real* inputData,
-                                real* maskData,
-                                size_t batchSize,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW,
-                                real* outputData) {}
-
-inline void hl_upsample_backward(real* outputGradData,
-                                 real* maskData,
-                                 size_t batchSize,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 real* inputGradData) {}
-
-#endif  // HL_CNN_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
deleted file mode 100644
index 0b2300cda95..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUBLAS_STUB_H_
-#define HL_CUDA_CUBLAS_STUB_H_
-
-#include "hl_cuda_cublas.h"
-
-inline void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_inverse(
-    real *A_d, real *C_d, int dimN, int lda, int ldc) {}
-
-inline void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int ldb,
-                          int ldc) {}
-
-inline void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta) {}
-
-#endif  // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
deleted file mode 100644
index 4b8bdf7507b..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_STUB_H_
-#define HL_CUDA_CUDNN_STUB_H_
-
-#include "hl_cuda_cudnn.h"
-
-inline int hl_get_cudnn_lib_version() { return 0; }
-
-inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
-
-inline void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width) {}
-
-inline void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width,
-                              int nStride,
-                              int cStride,
-                              int hStride,
-                              int wStride) {}
-
-inline void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {}
-
-inline void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                         hl_pooling_mode_t mode,
-                                         int height,
-                                         int width,
-                                         int height_padding,
-                                         int width_padding,
-                                         int stride_height,
-                                         int stride_width) {}
-
-inline void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {}
-
-inline void hl_pooling_forward(hl_tensor_descriptor input,
-                               real* input_image,
-                               hl_tensor_descriptor output,
-                               real* output_image,
-                               hl_pooling_descriptor pooling) {}
-
-inline void hl_pooling_backward(hl_tensor_descriptor input,
-                                real* input_image,
-                                real* input_image_grad,
-                                hl_tensor_descriptor output,
-                                real* output_image,
-                                real* output_image_grad,
-                                hl_pooling_descriptor pooling) {}
-
-inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                        int input_feature_maps,
-                                        int output_feature_maps,
-                                        int height,
-                                        int width) {}
-
-inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
-
-inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                             hl_tensor_descriptor image,
-                                             hl_filter_descriptor filter,
-                                             int padding_height,
-                                             int padding_width,
-                                             int stride_height,
-                                             int stride_width,
-                                             int dilation_h,
-                                             int dilation_w) {}
-
-inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                            hl_tensor_descriptor image,
-                                            hl_filter_descriptor filter,
-                                            int padding_height,
-                                            int padding_width,
-                                            int stride_height,
-                                            int stride_width,
-                                            int dilation_h,
-                                            int dilation_w) {}
-
-inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
-
-inline void hl_conv_workspace(hl_tensor_descriptor input,
-                              hl_tensor_descriptor output,
-                              hl_filter_descriptor filter,
-                              hl_convolution_descriptor conv,
-                              int* convFwdAlgo,
-                              size_t* fwdLimitBytes,
-                              int* convBwdDataAlgo,
-                              size_t* bwdDataLimitBytes,
-                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes,
-                              bool useDilation) {}
-
-inline void hl_convolution_forward(hl_tensor_descriptor input,
-                                   real* input_data,
-                                   hl_tensor_descriptor output,
-                                   real* output_data,
-                                   hl_filter_descriptor filter,
-                                   real* filter_data,
-                                   hl_convolution_descriptor conv,
-                                   void* gpuWorkSpace,
-                                   size_t sizeInBytes,
-                                   int convFwdAlgo) {}
-
-inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                            real* bias_data,
-                                            hl_tensor_descriptor output,
-                                            real* output_data) {}
-
-inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                           real* input_data,
-                                           hl_tensor_descriptor output,
-                                           real* output_grad_data,
-                                           hl_filter_descriptor filter,
-                                           real* filter_grad_data,
-                                           hl_convolution_descriptor conv,
-                                           void* gpuWorkSpace,
-                                           size_t sizeInBytes,
-                                           int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                         real* input_data_grad,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data,
-                                         hl_filter_descriptor filter,
-                                         real* filter_data,
-                                         hl_convolution_descriptor conv,
-                                         void* gpuWorkSpace,
-                                         size_t sizeInBytes,
-                                         int convBwdDataAlgo) {}
-
-inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                         real* bias_grad_data,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data) {}
-
-inline void hl_softmax_forward(real* input,
-                               real* output,
-                               int height,
-                               int width) {}
-
-inline void hl_softmax_backward(real* output_value,
-                                real* output_grad,
-                                int height,
-                                int width) {}
-
-inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real* input,
-                                           hl_tensor_descriptor outputDesc,
-                                           real* output,
-                                           hl_tensor_descriptor bnParamDesc,
-                                           real* scale,
-                                           real* bias,
-                                           double factor,
-                                           real* runningMean,
-                                           real* runningInvVar,
-                                           double epsilon,
-                                           real* savedMean,
-                                           real* savedVar) {}
-
-inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real* input,
-                                            hl_tensor_descriptor outputDesc,
-                                            real* output,
-                                            hl_tensor_descriptor bnParamDesc,
-                                            real* scale,
-                                            real* bias,
-                                            real* estimatedMean,
-                                            real* estimatedVar,
-                                            double epsilon) {}
-
-inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real* input,
-                                   hl_tensor_descriptor outGradDesc,
-                                   real* outGrad,
-                                   hl_tensor_descriptor inGradDesc,
-                                   real* inGrad,
-                                   hl_tensor_descriptor dBnParamDesc,
-                                   real* scale,
-                                   real* scaleGrad,
-                                   real* biasGrad,
-                                   double epsilon,
-                                   real* savedMean,
-                                   real* savedInvVar) {}
-
-#endif  // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_stub.h
deleted file mode 100644
index ac8b22ef31a..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_stub.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_STUB_H_
-#define HL_CUDA_STUB_H_
-
-#include "hl_cuda.h"
-
-inline void hl_start() {}
-
-inline void hl_specify_devices_start(int *device, int number) {}
-
-inline void hl_init(int device) {}
-
-inline int hl_get_cuda_lib_version(int device) { return 0; }
-
-inline void hl_fini() {}
-
-inline void hl_set_sync_flag(bool flag) {}
-
-inline bool hl_get_sync_flag() { return false; }
-
-inline int hl_get_device_count() { return 0; }
-
-inline void hl_set_device(int device) {}
-
-inline int hl_get_device() { return 0; }
-
-inline void *hl_malloc_device(size_t size) { return NULL; }
-
-inline void hl_free_mem_device(void *dest_d) {}
-
-inline void *hl_malloc_host(size_t size) { return NULL; }
-
-inline void hl_free_mem_host(void *dest_h) {}
-
-inline void hl_memcpy(void *dst, void *src, size_t size) {}
-
-inline void hl_memset_device(void *dest_d, int value, size_t size) {}
-
-inline void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {}
-
-inline void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {}
-
-inline void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {}
-
-inline void hl_rand(real *dest_d, size_t num) {}
-
-inline void hl_srand(unsigned int seed) {}
-
-inline void hl_memcpy_async(void *dst,
-                            void *src,
-                            size_t size,
-                            hl_stream_t stream) {}
-
-inline void hl_stream_synchronize(hl_stream_t stream) {}
-
-inline void hl_create_event(hl_event_t *event) {}
-
-inline void hl_destroy_event(hl_event_t event) {}
-
-inline float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
-  return 0;
-}
-
-inline void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {}
-
-inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
-
-inline void hl_event_synchronize(hl_event_t event) {}
-
-inline int hl_get_device_last_error() { return 0; }
-
-inline const char *hl_get_device_error_string() { return NULL; }
-
-inline const char *hl_get_device_error_string(size_t err) { return NULL; }
-
-inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
-
-inline void hl_device_synchronize() {}
-
-inline void hl_profiler_start() {}
-
-inline void hl_profiler_end() {}
-
-#endif  // HL_CUDA_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_lstm_stub.h b/paddle/legacy/cuda/include/stub/hl_lstm_stub.h
deleted file mode 100644
index be2b71787e5..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_lstm_stub.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_LSTM_STUB_H_
-#define HL_LSTM_STUB_H_
-
-#include "hl_lstm.h"
-
-inline void hl_lstm_parallel_forward(real *gateValue,
-                                     real *stateValue,
-                                     real *preOutputValue,
-                                     real *outputValue,
-                                     real *checkIg,
-                                     real *checkFg,
-                                     real *checkOg,
-                                     real *weight,
-                                     const int *sequence,
-                                     int frameSize,
-                                     int numSequences,
-                                     bool reversed,
-                                     hl_activation_mode_t active_node,
-                                     hl_activation_mode_t active_gate,
-                                     hl_activation_mode_t active_state) {}
-
-inline void hl_lstm_parallel_backward_data(real *gateValue,
-                                           real *gateGrad,
-                                           real *stateValue,
-                                           real *stateGrad,
-                                           real *preOutputValue,
-                                           real *preOutputGrad,
-                                           real *outputGrad,
-                                           real *checkIg,
-                                           real *checkIgGrad,
-                                           real *checkFg,
-                                           real *checkFgGrad,
-                                           real *checkOg,
-                                           real *checkOgGrad,
-                                           real *weight,
-                                           const int *sequence,
-                                           int frameSize,
-                                           int numSequences,
-                                           bool reversed,
-                                           hl_activation_mode_t active_node,
-                                           hl_activation_mode_t active_gate,
-                                           hl_activation_mode_t active_state) {}
-
-inline void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                             real *outputValue,
-                                             real *gateGrad,
-                                             const int *sequence,
-                                             int frameSize,
-                                             int batchSize,
-                                             int numSequences,
-                                             bool reversed) {}
-
-#endif  // HL_LSTM_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_matrix_stub.h b/paddle/legacy/cuda/include/stub/hl_matrix_stub.h
deleted file mode 100644
index 914a2edaf21..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_matrix_stub.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_STUB_H_
-#define HL_MATRIX_STUB_H_
-
-#include "hl_matrix.h"
-
-inline void hl_matrix_add(real* A_d,
-                          real* B_d,
-                          real* C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta) {}
-
-inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-
-inline void hl_sequence_softmax_forward(real* A_d,
-                                        real* C_d,
-                                        const int* index,
-                                        int numSequence) {}
-
-inline void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
-
-inline void hl_matrix_classification_error(real* topVal,
-                                           int ldv,
-                                           int* topIds,
-                                           real* src,
-                                           int lds,
-                                           int dim,
-                                           int topkSize,
-                                           int numSamples,
-                                           int* label,
-                                           real* recResult) {}
-
-inline void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(
-    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
-
-inline void hl_matrix_zero_mem(real* data, int num) {}
-
-inline void hl_param_relu_forward(real* output,
-                                  real* input,
-                                  real* w,
-                                  int width,
-                                  int height,
-                                  int partial_sum) {}
-
-inline void hl_param_relu_backward_w(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum) {}
-
-inline void hl_param_relu_backward_diff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum) {}
-
-inline void hl_matrix_add_shared_bias(real* A_d,
-                                      real* B_d,
-                                      const int channel,
-                                      const int dimM,
-                                      const int dimN,
-                                      real scale) {}
-
-inline void hl_matrix_collect_shared_bias(real* B_d,
-                                          real* A_d,
-                                          const int channel,
-                                          const int dimM,
-                                          const int dimN,
-                                          real scale) {}
-
-inline void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
-
-inline void hl_matrix_vol2Col(const real* dataSrc,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              real* dataDst) {}
-
-inline void hl_matrix_col2Vol(real* dataDst,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              const real* dataSrc,
-                              real alpha,
-                              real beta) {}
-
-inline void hl_vector_cast2int(int* out, real* vec, int size) {}
-
-#endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_sequence_stub.h b/paddle/legacy/cuda/include/stub/hl_sequence_stub.h
deleted file mode 100644
index 44bc3dbaff3..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_sequence_stub.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SEQUENCE_STUB_H_
-#define HL_SEQUENCE_STUB_H_
-
-#include "hl_sequence.h"
-
-inline void hl_max_sequence_forward(real* input,
-                                    const int* sequence,
-                                    real* output,
-                                    int* index,
-                                    int numSequences,
-                                    int dim) {}
-
-inline void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
-
-inline void hl_sequence2batch_copy(real* batch,
-                                   real* sequence,
-                                   const int* batchIndex,
-                                   int seqWidth,
-                                   int batchCount,
-                                   bool seq2batch) {}
-
-inline void hl_sequence2batch_add(real* batch,
-                                  real* sequence,
-                                  int* batchIndex,
-                                  int seqWidth,
-                                  int batchCount,
-                                  bool seq2batch) {}
-
-inline void hl_sequence2batch_copy_padding(real* batch,
-                                           real* sequence,
-                                           const int* sequenceStartPositions,
-                                           const size_t sequenceWidth,
-                                           const size_t maxSequenceLength,
-                                           const size_t numSequences,
-                                           bool normByTimes,
-                                           bool seq2batch) {}
-
-inline void hl_sequence_avg_forward(real* dst,
-                                    real* src,
-                                    const int* starts,
-                                    int height,
-                                    int width,
-                                    const int mode) {}
-
-inline void hl_sequence_avg_backward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode) {}
-#endif  // HL_SEQUENCE_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_sparse_stub.h b/paddle/legacy/cuda/include/stub/hl_sparse_stub.h
deleted file mode 100644
index 4001d4fb741..00000000000
--- a/paddle/legacy/cuda/include/stub/hl_sparse_stub.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SPARSE_STUB_H_
-#define HL_SPARSE_STUB_H_
-
-#include "hl_sparse.h"
-
-inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                    hl_matrix_format_t format,
-                                    hl_matrix_value_t value_type,
-                                    int dimM,
-                                    int dimN,
-                                    int nnz) {}
-
-inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
-
-inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void *dest_d,
-                                       size_t size,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz) {}
-
-inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real *value_d,
-                                       int *rows_d,
-                                       int *cols_d,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz) {}
-
-inline void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {}
-
-inline void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                                 real *csr_val,
-                                 int *csr_row,
-                                 int *csr_col,
-                                 hl_stream_t stream) {}
-
-inline void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                                 real *csc_val,
-                                 int *csc_row,
-                                 int *csc_col,
-                                 hl_stream_t stream) {}
-
-inline void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                                    hl_sparse_matrix_s src,
-                                    hl_stream_t stream) {}
-
-inline void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN) {}
-
-inline void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN) {}
-
-inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_matrix_dense_mul_csc(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_sparse_matrix_mul(real *A_d,
-                                 hl_trans_op_t transa,
-                                 real *B_d,
-                                 hl_trans_op_t transb,
-                                 hl_sparse_matrix_s C_d,
-                                 int dimM,
-                                 int dimN,
-                                 int dimK,
-                                 real alpha,
-                                 real beta) {}
-
-inline void hl_matrix_dense_mul_csr(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_memcpy_from_csc_matrix(real *csc_val,
-                                      size_t val_size,
-                                      int *csc_row,
-                                      size_t row_size,
-                                      int *csc_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csc_matrix,
-                                      hl_stream_t stream) {}
-
-inline void hl_memcpy_from_csr_matrix(real *csr_val,
-                                      size_t val_size,
-                                      int *csr_row,
-                                      size_t row_size,
-                                      int *csr_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csr_matrix,
-                                      hl_stream_t stream) {}
-
-inline void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-
-inline void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-
-inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real *B_d,
-                                      real scale) {}
-
-inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real *B_d,
-                                   real scale) {}
-
-inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real *B_d,
-                                       int dimM,
-                                       int dimN,
-                                       real alpha,
-                                       real beta) {}
-
-inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real *B_d,
-                                    int dimM,
-                                    int dimN,
-                                    real alpha,
-                                    real beta) {}
-
-inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-
-inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-
-inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
-  return NULL;
-}
-
-#endif  // HL_SPARSE_STUB_H_
diff --git a/paddle/legacy/cuda/src/avx_mathfun.h b/paddle/legacy/cuda/src/avx_mathfun.h
deleted file mode 100644
index 8e698e746a1..00000000000
--- a/paddle/legacy/cuda/src/avx_mathfun.h
+++ /dev/null
@@ -1,735 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/*
-   AVX implementation of sin, cos, sincos, exp and log
-
-   Based on "sse_mathfun.h", by Julien Pommier
-   http://gruntthepeon.free.fr/ssemath/
-
-   Copyright (C) 2012 Giovanni Garberoglio
-   Interdisciplinary Laboratory for Computational Science (LISC)
-   Fondazione Bruno Kessler and University of Trento
-   via Sommarive, 18
-   I-38123 Trento (Italy)
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-
-  (this is the zlib license)
-*/
-
-#include <immintrin.h>
-
-/* yes I know, the top of this file is quite ugly */
-#define ALIGN32_BEG
-#define ALIGN32_END __attribute__((aligned(32)))
-
-/* __m128 is ugly to write */
-typedef __m256 v8sf;   // vector of 8 float (avx)
-typedef __m256i v8si;  // vector of 8 int   (avx)
-typedef __m128i v4si;  // vector of 8 int   (avx)
-
-#define _PI32AVX_CONST(Name, Val)                                 \
-  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
-      Val, Val, Val, Val}
-
-_PI32AVX_CONST(1, 1);
-_PI32AVX_CONST(inv1, ~1);
-_PI32AVX_CONST(2, 2);
-_PI32AVX_CONST(4, 4);
-
-/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val)                                   \
-  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PI32_CONST256(Name, Val)                                  \
-  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PS256_CONST_TYPE(Name, Type, Val)                       \
-  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-
-_PS256_CONST(1, 1.0f);
-_PS256_CONST(0p5, 0.5f);
-/* the smallest non denormalized float number */
-_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
-_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
-_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
-
-_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
-_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
-
-_PI32_CONST256(0, 0);
-_PI32_CONST256(1, 1);
-_PI32_CONST256(inv1, ~1);
-_PI32_CONST256(2, 2);
-_PI32_CONST256(4, 4);
-_PI32_CONST256(0x7f, 0x7f);
-
-_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
-_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
-_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
-_PS256_CONST(cephes_log_q1, -2.12194440e-4);
-_PS256_CONST(cephes_log_q2, 0.693359375);
-
-#ifndef __AVX2__
-
-typedef union imm_xmm_union {
-  v8si imm;
-  v4si xmm[2];
-} imm_xmm_union;
-
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)       \
-  {                                               \
-    imm_xmm_union u __attribute__((aligned(32))); \
-    u.imm = imm_;                                 \
-    xmm0_ = u.xmm[0];                             \
-    xmm1_ = u.xmm[1];                             \
-  }
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)       \
-  {                                               \
-    imm_xmm_union u __attribute__((aligned(32))); \
-    u.xmm[0] = xmm0_;                             \
-    u.xmm[1] = xmm1_;                             \
-    imm_ = u.imm;                                 \
-  }
-
-#define AVX2_BITOP_USING_SSE2(fn)                        \
-  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
-    /* use SSE2 instruction to perform the bitop AVX2 */ \
-    v4si x1, x2;                                         \
-    v8si ret;                                            \
-    COPY_IMM_TO_XMM(x, x1, x2);                          \
-    x1 = _mm_##fn(x1, a);                                \
-    x2 = _mm_##fn(x2, a);                                \
-    COPY_XMM_TO_IMM(x1, x2, ret);                        \
-    return (ret);                                        \
-  }
-
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
-AVX2_BITOP_USING_SSE2(slli_epi32)
-AVX2_BITOP_USING_SSE2(srli_epi32)
-
-#define AVX2_INTOP_USING_SSE2(fn)                                     \
-  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
-    /* use SSE2 instructions to perform the AVX2 integer operation */ \
-    v4si x1, x2;                                                      \
-    v4si y1, y2;                                                      \
-    v8si ret;                                                         \
-    COPY_IMM_TO_XMM(x, x1, x2);                                       \
-    COPY_IMM_TO_XMM(y, y1, y2);                                       \
-    x1 = _mm_##fn(x1, y1);                                            \
-    x2 = _mm_##fn(x2, y2);                                            \
-    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
-    return (ret);                                                     \
-  }
-
-//#warning "Using SSE2 to perform AVX2 integer ops"
-AVX2_INTOP_USING_SSE2(and_si128)
-AVX2_INTOP_USING_SSE2(andnot_si128)
-AVX2_INTOP_USING_SSE2(cmpeq_epi32)
-AVX2_INTOP_USING_SSE2(sub_epi32)
-AVX2_INTOP_USING_SSE2(add_epi32)
-#define avx2_mm256_and_si256 avx2_mm256_and_si128
-#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
-#else
-#define avx2_mm256_slli_epi32 _mm256_slli_epi32
-#define avx2_mm256_srli_epi32 _mm256_srli_epi32
-#define avx2_mm256_and_si256 _mm256_and_si256
-#define avx2_mm256_andnot_si256 _mm256_andnot_si256
-#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
-#define avx2_mm256_sub_epi32 _mm256_sub_epi32
-#define avx2_mm256_add_epi32 _mm256_add_epi32
-#endif /* __AVX2__ */
-
-/* natural logarithm computed for 8 simultaneous float
-   return NaN for x <= 0
-*/
-v8sf log256_ps(v8sf x) {
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
-  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
-
-  // can be done with AVX2
-  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
-
-  // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  v8sf e = _mm256_cvtepi32_ps(imm0);
-
-  e = _mm256_add_ps(e, one);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
-  v8sf tmp = _mm256_and_ps(x, mask);
-  x = _mm256_sub_ps(x, one);
-  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
-  x = _mm256_add_ps(x, tmp);
-
-  v8sf z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
-  y = _mm256_mul_ps(y, x);
-
-  y = _mm256_mul_ps(y, z);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
-  y = _mm256_add_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
-  x = _mm256_add_ps(x, y);
-  x = _mm256_add_ps(x, tmp);
-  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
-  return x;
-}
-
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
-
-_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
-_PS256_CONST(cephes_exp_C1, 0.693359375);
-_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
-
-_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
-_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
-_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
-_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
-_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
-_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
-
-v8sf exp256_ps(v8sf x) {
-  v8sf tmp = _mm256_setzero_ps(), fx;
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
-
-  /* how to perform a floorf with SSE: just below */
-  // imm0 = _mm256_cvttps_epi32(fx);
-  // tmp  = _mm256_cvtepi32_ps(imm0);
-
-  tmp = _mm256_floor_ps(fx);
-
-  /* if greater, substract 1 */
-  // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
-  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
-  mask = _mm256_and_ps(mask, one);
-  fx = _mm256_sub_ps(tmp, mask);
-
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
-  x = _mm256_sub_ps(x, tmp);
-  x = _mm256_sub_ps(x, z);
-
-  z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, x);
-  y = _mm256_add_ps(y, one);
-
-  /* build 2^n */
-  imm0 = _mm256_cvttps_epi32(fx);
-  // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  imm0 = avx2_mm256_slli_epi32(imm0, 23);
-  v8sf pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-
-_PS256_CONST(minus_cephes_DP1, -0.78515625);
-_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
-_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
-_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
-_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
-_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
-
-/* evaluation of 8 sines at onces using AVX intrisics
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-
-*/
-v8sf sin256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  sign_bit = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-/*
-  Here we start a series of integer operations, which are in the
-  realm of AVX2.
-  If we don't have AVX, let's perform them using SSE2 directives
-*/
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* almost the same as sin_ps */
-v8sf cos256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
-   replace both of them..
-   it is almost as fast, and gives you a free cosine with your sine */
-void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
-  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
-  v8si imm0, imm2, imm4;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-  v4si imm4_1, imm4_2;
-#endif
-
-  sign_bit_sin = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in imm2 */
-  imm2 = _mm256_cvttps_epi32(y);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-
-  y = _mm256_cvtepi32_ps(imm2);
-  imm4 = imm2;
-
-  /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-
-  /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-// v8sf poly_mask = _mm256_castsi256_ps(imm2);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm4_1 = imm2_1;
-  imm4_2 = imm2_2;
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-#ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
-  imm4 = avx2_mm256_slli_epi32(imm4, 29);
-#else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
-
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
-
-  imm4_1 = _mm_slli_epi32(imm4_1, 29);
-  imm4_2 = _mm_slli_epi32(imm4_2, 29);
-
-  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
-#endif
-
-  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
-
-  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
-  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
-  y2 = _mm256_sub_ps(y2, ysin2);
-  y = _mm256_sub_ps(y, ysin1);
-
-  xmm1 = _mm256_add_ps(ysin1, ysin2);
-  xmm2 = _mm256_add_ps(y, y2);
-
-  /* update the sign */
-  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
-  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
-}
diff --git a/paddle/legacy/cuda/src/hl_avx_functions.cc b/paddle/legacy/cuda/src/hl_avx_functions.cc
deleted file mode 100644
index 6fb7c9dd06a..00000000000
--- a/paddle/legacy/cuda/src/hl_avx_functions.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <immintrin.h>
-#include "hl_functions.h"
-
-namespace hppl {
-
-extern __m256 exp(__m256 a);
-
-__m256 relu(const __m256 a) {
-  __m256 tmp = _mm256_set1_ps(0.0f);
-  return _mm256_max_ps(a, tmp);
-}
-
-__m256 sigmoid(const __m256 a) {
-  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m256 tmp = _mm256_max_ps(a, min);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
-  tmp = exp(tmp);
-  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
-  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
-  return tmp;
-}
-
-__m256 tanh(const __m256 a) {
-  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
-  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = exp(tmp);
-  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
-                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
-                       _mm256_set1_ps(1.0f));
-}
-
-__m256 linear(const __m256 a) { return a; }
-
-__m256 relu(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a,
-      _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
-                    _mm256_set1_ps(1.0f)));
-}
-
-__m256 sigmoid(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(_mm256_mul_ps(a, b),
-                       _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
-}
-
-__m256 tanh(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
-}
-
-__m256 linear(const __m256 a, const __m256 b) { return a; }
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_batch_norm.cu b/paddle/legacy/cuda/src/hl_batch_norm.cu
deleted file mode 100644
index f9ffde0d53e..00000000000
--- a/paddle/legacy/cuda/src/hl_batch_norm.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_batch_norm.h"
-
-__global__ void batchNormInference(real* output,
-                                   const real* input,
-                                   const real* scale,
-                                   const real* bias,
-                                   const real* estimatedMean,
-                                   const real* estimatedVar,
-                                   const double epsilon,
-                                   size_t batchSize,
-                                   size_t channel,
-                                   size_t height,
-                                   size_t width) {
-  const int tid = threadIdx.x;
-  const int num = channel * height * width;
-  const int batch = blockIdx.x;
-  for (int i = tid; i < num; i += blockDim.x) {
-    const int c = i / (height * width);
-    const int id = batch * num + i;
-    real val = input[id] - estimatedMean[c];
-    val /= sqrt(estimatedVar[c] + epsilon);
-    val *= scale[c];
-    val += bias[c];
-    output[id] = val;
-  }
-}
-
-void hl_batch_norm_cuda_inference(const real* input,
-                                  real* output,
-                                  const real* scale,
-                                  const real* bias,
-                                  const real* estimatedMean,
-                                  const real* estimatedVar,
-                                  const double epsilon,
-                                  size_t batchSize,
-                                  size_t channel,
-                                  size_t height,
-                                  size_t width) {
-  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
-                                                            input,
-                                                            scale,
-                                                            bias,
-                                                            estimatedMean,
-                                                            estimatedVar,
-                                                            epsilon,
-                                                            batchSize,
-                                                            channel,
-                                                            height,
-                                                            width);
-
-  CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
-}
diff --git a/paddle/legacy/cuda/src/hl_batch_transpose.cu b/paddle/legacy/cuda/src/hl_batch_transpose.cu
deleted file mode 100644
index 221839905d7..00000000000
--- a/paddle/legacy/cuda/src/hl_batch_transpose.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_batch_transpose.h"
-
-const int TILE_DIM = 64;
-const int BLOCK_ROWS = 16;
-
-// No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(
-    real* odata, const real* idata, int numSamples, int width, int height) {
-  __shared__ float tile[TILE_DIM][TILE_DIM + 1];
-
-  const int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  const int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  const int sampleId = blockIdx.z;
-  if (sampleId > numSamples) return;
-  if (x < width) {
-    for (int j = threadIdx.y; j < TILE_DIM && j < height - y + threadIdx.y;
-         j += BLOCK_ROWS)
-      tile[j][threadIdx.x] =
-          idata[sampleId * width * height + (y + j - threadIdx.y) * width + x];
-  }
-
-  __syncthreads();
-
-  // The matrix is tranposed. Thus height is new width, and width is new height.
-  const int newX = blockIdx.y * TILE_DIM + threadIdx.x;
-  const int newY = blockIdx.x * TILE_DIM + threadIdx.y;
-  if (newX >= height) {
-    return;
-  }
-  for (int j = threadIdx.y; j < TILE_DIM && j < width - newY + threadIdx.y;
-       j += BLOCK_ROWS)
-    odata[sampleId * width * height + (newY + j - threadIdx.y) * height +
-          newX] = tile[threadIdx.x][j];
-}
-
-void batchTranspose(
-    const real* input, real* output, int width, int height, int batchSize) {
-  dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
-  dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-      output, input, batchSize, width, height);
-
-  CHECK_SYNC("batchTranspose failed!");
-}
diff --git a/paddle/legacy/cuda/src/hl_cpu_functions.cc b/paddle/legacy/cuda/src/hl_cpu_functions.cc
deleted file mode 100644
index 1306576bcb9..00000000000
--- a/paddle/legacy/cuda/src/hl_cpu_functions.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <math.h>
-#include "hl_functions.h"
-
-namespace hppl {
-
-real relu(const real a) { return a > 0.0f ? a : 0.0f; }
-
-real sigmoid(const real a) {
-  const real min = SIGMOID_THRESHOLD_MIN;
-  const real max = SIGMOID_THRESHOLD_MAX;
-  real tmp = (a < min) ? min : ((a > max) ? max : a);
-  return 1.0 / (1.0 + exp(-tmp));
-}
-
-real tanh(const real a) {
-  real tmp = -2.0 * a;
-  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-  return (2.0 / (1.0 + exp(tmp))) - 1.0;
-}
-
-real linear(const real a) { return a; }
-
-real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
-
-real sigmoid(const real a, const real b) { return a * b * (1 - b); }
-
-real tanh(const real a, const real b) { return a * (1.0f - b * b); }
-
-real linear(const real a, const real b) { return a; }
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
deleted file mode 100644
index 9831c5ecc34..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
+++ /dev/null
@@ -1,293 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_aggregate.h"
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_cuda.ph"
-#include "hl_matrix_base.cuh"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-/**
- * @brief   matrix row operator.
- */
-template <class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
-  __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize - 1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
-  int index = rowId * dimN;
-  int tid = threadIdx.x;
-  int lmt = tid;
-
-  real tmp = agg.init();
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    tmp = agg(tmp, E[index + lmt]);
-    lmt += blockSize;
-  }
-  sum_s[tid] = tmp;
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[rowId] = sum_s[0];
-  }
-}
-
-template <class Agg>
-void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      agg, A_d, C_d, dimN);
-}
-
-void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_sum failed");
-}
-
-void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_max failed");
-}
-
-void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_min failed");
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg>
-__global__ void KeMatrixColumnOp(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (int index = 0; index < dimM; index++) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-    }
-    Sum[rowIdx] = tmp;
-  }
-}
-
-template <class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  __shared__ real _sum[blockDimX * blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int index = threadIdx.y;
-
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (; index < dimM;) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-      index += blockDimY;
-    }
-  }
-  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y == 0) {
-      real tmp = agg.init();
-      for (int i = 0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
-      }
-      Sum[rowIdx] = tmp;
-    }
-  }
-}
-
-template <class Agg>
-void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 - 1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  } else {
-    int blocksX = (dimN + 32 - 1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  }
-
-  return;
-}
-
-void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_sum failed");
-}
-
-void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_max failed");
-}
-
-void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_min failed");
-}
-
-template <int blockSize>
-__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += E[index];
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
-
-template <int blockSize>
-__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += abs(E[index]);
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cnn.cu b/paddle/legacy/cuda/src/hl_cuda_cnn.cu
deleted file mode 100644
index bac743a293c..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cnn.cu
+++ /dev/null
@@ -1,1106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <float.h>
-#include "hl_base.h"
-#include "hl_cnn.h"
-#include "hl_device_functions.cuh"
-
-__global__ void KeMaxPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int ksizeW,
-                                 const int ksizeH,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int offsetH,
-                                 const int offsetW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 real* maskData) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-    int hstart = ph * strideH - offsetH;
-    int wstart = pw * strideW - offsetW;
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int max_index = -1;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w]) {
-          max_index = h * width + w;
-          maxval = inputData[max_index];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    if (maskData != NULL) {
-      maskData[tgtIndex] = max_index;
-    }
-  }
-}
-
-void hl_maxpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        real* maskData) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         tgtData,
-                                                         tgtStride,
-                                                         maskData);
-  CHECK_SYNC("hl_maxpool_forward failed");
-}
-
-__global__ void KeMaxPoolBackward(const int nthreads,
-                                  const real* inputData,
-                                  const real* outData,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  const int outStride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    // find out the local index
-    // find out the local offset
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-
-    int frameNum = index / width / height / channels;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    real input = inputData[index];
-    outData += (frameNum * outStride + offsetC * pooledH * pooledW);
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        if (input == outData[ph * pooledW + pw]) {
-          gradient += outGrad[ph * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_maxpool_backward(const int frameCnt,
-                         const real* inputData,
-                         const real* outData,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* targetGrad,
-                         const int outStride) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         outData,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         targetGrad,
-                                                         outStride);
-  CHECK_SYNC("hl_maxpool_backward");
-}
-
-__global__ void KeAvgPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeX,
-                                 const int sizeY,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int padH,
-                                 const int padW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int poolSize =
-        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        aveval += inputData[h * width + w];
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / poolSize;
-  }
-}
-
-void hl_avgpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        const bool excludeMode) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                        inputData,
-                                                        channels,
-                                                        height,
-                                                        width,
-                                                        pooledH,
-                                                        pooledW,
-                                                        sizeX,
-                                                        sizeY,
-                                                        strideH,
-                                                        strideW,
-                                                        paddingH,
-                                                        paddingW,
-                                                        tgtData,
-                                                        tgtStride,
-                                                        excludeMode);
-  CHECK_SYNC("hl_avgpool_forward failed");
-}
-
-__global__ void KeAvgPoolBackward(const int nthreads,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* tgtGrad,
-                                  const int outStride,
-                                  const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-    int frameNum = index / width / height / channels;
-
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-
-    for (int ph = phstart; ph < phend; ++ph) {
-      int hstart = ph * strideH - padH;
-      int hend = min(hstart + sizeY, height);
-      hstart = max(hstart, 0);
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        // figure out the pooling size
-        int wstart = pw * strideW - padW;
-        int wend = min(wstart + sizeX, width);
-        wstart = max(wstart, 0);
-        int poolSize =
-            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-        gradient += outGrad[ph * pooledW + pw] / poolSize;
-      }
-    }
-    tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_avgpool_backward(const int frameCnt,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* backGrad,
-                         const int outStride,
-                         const bool excludeMode) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         backGrad,
-                                                         outStride,
-                                                         excludeMode);
-  CHECK_SYNC("hl_avgpool_backward failed");
-}
-
-__global__ void KeMaxPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int ksizeD,
-                                   const int ksizeH,
-                                   const int ksizeW,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   real* maxPoolIdxData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + ksizeD, depth);
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int maxIdx = -1;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          if (maxval < inputData[(d * height + h) * width + w]) {
-            maxval = inputData[(d * height + h) * width + w];
-            maxIdx = (d * height + h) * width + w;
-          }
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    maxPoolIdxData[tgtIndex] = maxIdx;
-  }
-}
-
-void hl_maxpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int padD,
-                          const int padH,
-                          const int padW,
-                          real* tgtData,
-                          real* maxPoolIdxData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           inputData,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           pooledD,
-                                                           pooledH,
-                                                           pooledW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           padD,
-                                                           padH,
-                                                           padW,
-                                                           tgtData,
-                                                           maxPoolIdxData,
-                                                           tgtStride);
-  CHECK_SYNC("hl_maxpool3D_forward failed");
-}
-
-__global__ void KeMaxPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* targetGrad,
-                                    real* maxPoolIdxData,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width;
-    int offsetH = (index / width) % height;
-    int offsetD = (index / width / height) % depth;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart =
-        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
-    int phstart =
-        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
-    int pwstart =
-        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
-    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
-    int phend = min((offsetH + padH) / strideH + 1, pooledH);
-    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    maxPoolIdxData +=
-        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (((offsetD * height + offsetH) * width + offsetW) ==
-              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
-            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
-  }
-}
-
-void hl_maxpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           const int paddingD,
-                           const int paddingH,
-                           const int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* targetGrad,
-                           real* maxPoolIdxData,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           targetGrad,
-                                                           maxPoolIdxData,
-                                                           outStride);
-  CHECK_SYNC("hl_maxpool3D_backward");
-}
-
-__global__ void KeAvgPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int sizeZ,
-                                   const int sizeY,
-                                   const int sizeX,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth);
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += inputData[(d * height + h) * width + w];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
-  }
-}
-
-void hl_avgpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int paddingD,
-                          const int paddingH,
-                          const int paddingW,
-                          real* tgtData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          inputData,
-                                                          channels,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          pooledD,
-                                                          pooledH,
-                                                          pooledW,
-                                                          sizeZ,
-                                                          sizeY,
-                                                          sizeX,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          tgtData,
-                                                          tgtStride);
-  CHECK_SYNC("hl_avgpool3D_forward failed");
-}
-
-__global__ void KeAvgPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* tgtGrad,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetD = (index / width / height) % depth + padD;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int pdend = min(offsetD / strideD + 1, pooledD);
-    int phend = min(offsetH / strideH + 1, pooledH);
-    int pwend = min(offsetW / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
-
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      int dstart = pd * strideD - padD;
-      int dend = min(dstart + sizeZ, depth);
-      dstart = max(dstart, 0);
-      for (int ph = phstart; ph < phend; ++ph) {
-        int hstart = ph * strideH - padH;
-        int hend = min(hstart + sizeY, height);
-        hstart = max(hstart, 0);
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          // figure out the pooling size
-          int wstart = pw * strideW - padW;
-          int wend = min(wstart + sizeX, width);
-          wstart = max(wstart, 0);
-          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
-        }
-      }
-    }
-    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
-  }
-}
-
-void hl_avgpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           int paddingD,
-                           int paddingH,
-                           int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* backGrad,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           backGrad,
-                                                           outStride);
-  CHECK_SYNC("hl_avgpool3D_backward failed");
-}
-
-__global__ void KeBilinearInterpFw(const real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                            inImgIdy * inImgW + inImgIdx];
-
-    // bilinear interpolation
-    out[outIdH * outputW + outIdW] =
-        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
-        h1lambda * (w2lambda * inPos[hId * inImgW] +
-                    w1lambda * inPos[hId * inImgW + wId]);
-  }
-}
-
-void hl_bilinear_forward(const real* inData,
-                         const size_t inImgH,
-                         const size_t inImgW,
-                         const size_t inputH,
-                         const size_t inputW,
-                         real* outData,
-                         const size_t outImgH,
-                         const size_t outImgW,
-                         const size_t outputH,
-                         const size_t outputW,
-                         const size_t numChannels,
-                         const real ratioH,
-                         const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outData,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_forward failed");
-}
-
-__global__ void KeBilinearInterpBw(real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   const real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                      inImgIdy * inImgW + inImgIdx];
-    const real* outPos = &out[outIdH * outputW + outIdW];
-    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW],
-                            h1lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
-                            h1lambda * w1lambda * outPos[0]);
-  }
-}
-
-void hl_bilinear_backward(real* inGrad,
-                          const size_t inImgH,
-                          const size_t inImgW,
-                          const size_t inputH,
-                          const size_t inputW,
-                          const real* outGrad,
-                          const size_t outImgH,
-                          const size_t outImgW,
-                          const size_t outputH,
-                          const size_t outputW,
-                          const size_t numChannels,
-                          const real ratioH,
-                          const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outGrad,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_backward failed");
-}
-
-__global__ void maxoutFpCompute(size_t nthreads,
-                                const real* inData,
-                                real* outData,
-                                int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t data_idx =
-        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
-    real max = inData[data_idx];
-    int maxId = 0;
-    for (size_t g = 1; g < groups; ++g) {
-      real tmp = inData[data_idx + g * featLen];
-      if (tmp > max) {
-        max = tmp;
-        maxId = g;
-      }
-    }
-    outData[index] = max;
-    idData[index] = maxId;
-  }
-}
-
-void hl_maxout_forward(const real* inData,
-                       real* outData,
-                       int* idData,
-                       size_t batchSize,
-                       size_t size,
-                       size_t featLen,
-                       size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inData, outData, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_forward failed");
-}
-
-__global__ void maxoutBpCompute(size_t nthreads,
-                                real* inGrad,
-                                const real* outGrad,
-                                const int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t newIndex = batch_idx * size;
-    size_t gradIdx =
-        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
-    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
-  }
-}
-
-void hl_maxout_backward(real* inGrad,
-                        const real* outGrad,
-                        const int* idData,
-                        size_t batchSize,
-                        size_t size,
-                        size_t featLen,
-                        size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_backward failed");
-}
-
-__global__ void upsampleForwardCompute(real* input_data,
-                                       real* mask_data,
-                                       size_t nthreads,
-                                       size_t in_h,
-                                       size_t in_w,
-                                       size_t out_h,
-                                       size_t out_w,
-                                       real* output_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offset = index / (in_w * in_h) * out_h * out_w;
-    int upsample_idx = static_cast<int>(mask_data[index]);
-    output_data[offset + upsample_idx] = input_data[index];
-  }
-}
-
-__global__ void upsampleBackwardCompute(real* out_grad,
-                                        real* mask_data,
-                                        size_t nthreads,
-                                        size_t in_h,
-                                        size_t in_w,
-                                        size_t out_h,
-                                        size_t out_w,
-                                        real* input_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offset = index / (in_w * in_h) * out_h * out_w;
-    int upsample_idx = static_cast<int>(mask_data[index]);
-    input_grad[index] = out_grad[offset + upsample_idx];
-  }
-}
-
-void hl_upsample_forward(real* inputData,
-                         real* maskData,
-                         size_t batchSize,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t channels,
-                         size_t outputH,
-                         size_t outputW,
-                         real* outputData) {
-  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
-                                                              maskData,
-                                                              num_kernels,
-                                                              imgSizeH,
-                                                              imgSizeW,
-                                                              outputH,
-                                                              outputW,
-                                                              outputData);
-  CHECK_SYNC("hl_upsample_forward failed");
-}
-
-void hl_upsample_backward(real* outputGradData,
-                          real* maskData,
-                          size_t batchSize,
-                          size_t imgSizeH,
-                          size_t imgSizeW,
-                          size_t channels,
-                          size_t outputH,
-                          size_t outputW,
-                          real* inputGradData) {
-  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
-                                                               maskData,
-                                                               num_kernels,
-                                                               imgSizeH,
-                                                               imgSizeW,
-                                                               outputH,
-                                                               outputW,
-                                                               inputGradData);
-  CHECK_SYNC("hl_upsample_backward failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cublas.cc b/paddle/legacy/cuda/src/hl_cuda_cublas.cc
deleted file mode 100644
index 283b8b6e9c8..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cublas.cc
+++ /dev/null
@@ -1,400 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cublas.h"
-#include <sys/time.h>
-#include "hl_cuda.h"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cublas routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    cublasStatus_t operator()(Args... args) {                                  \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                           \
-      std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                    \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    cublasStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-
-// include all needed cublas functions in HPPL
-// clang-format off
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
-
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
-
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
-
-} /* namespace dynload */
-
-// clang-format on
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
-#endif
-
-const char *hl_cublas_get_error_string(cublasStatus_t status) {
-  switch (status) {
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "[cublas status]: not initialized";
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "[cublas status]: allocate failed";
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "[cublas status]: invalid value";
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "[cublas status]: arch mismatch";
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "[cublas status]: mapping error";
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "[cublas status]: execution failed";
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "[cublas status]: internal error";
-    case CUBLAS_STATUS_SUCCESS:
-      return "[cublas status]: success";
-    default:
-      return "[cublas status]: unknown error";
-  }
-}
-
-/**
- * Check build-in cublas function using glog and it also
- * support << operator for more details error info.
- */
-cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func)               \
-  g_cublasStat = cublas_func;                   \
-  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
-      << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
-
-void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
-  CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
-      << "[cublas init] Cublas create handle faild!";
-
-  CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
-      << "[cublas init] Cublas set stream faild!";
-}
-
-void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
-  real alpha = 1.0;
-  real beta = 0.0;
-
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           dimM,
-                           dimN,
-                           &alpha,
-                           A_d,
-                           lda,
-                           &beta,
-                           nullptr,
-                           dimM,
-                           C_d,
-                           ldc));
-  CHECK_SYNC("hl_matrix_transpose failed");
-}
-
-void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
-  hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
-}
-
-void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
-  /* Solve Ax = I */
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  /* Step 1: Compute the LU decomposition of matrix A */
-  real **inout_h = &A_d;
-  real **inout_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(inout_d, inout_h, sizeof(real *));
-
-  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
-  int *info_d = (int *)t_resource.gpu_mem;
-
-  /* Note: cublasSgetrfBatched is used to calculate a number of
-     small-sized matrices. There may be a better way to reconstruct
-     the API for better performance.
-   */
-  CHECK_CUBLAS(
-      CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
-
-  int info_h;
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
-  }
-
-  /* Step 2: Compute the inverse of the matrix given its LU decomposition */
-  real **out_h = &C_d;
-  real **out_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(out_d, out_h, sizeof(real *));
-
-  CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-                            dimN,
-                            (const real **)inout_d,
-                            lda,
-                            pivot_d,
-                            out_d,
-                            ldc,
-                            info_d,
-                            1));
-
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
-  }
-
-  hl_free_mem_device(inout_d);
-  hl_free_mem_device(pivot_d);
-  hl_free_mem_device(out_d);
-
-  CHECK_SYNC("hl_matrix_inverse failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta,
-                   int lda,
-                   int ldb,
-                   int ldc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
-    int m = (transa == HPPL_OP_N) ? dimM : dimK;
-    int n = (transa == HPPL_OP_N) ? dimK : dimM;
-    hl_matrix_mul_vector(
-        A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
-    return;
-  }
-
-  if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) {
-    int m = (transb == HPPL_OP_N) ? dimK : dimN;
-    int n = (transb == HPPL_OP_N) ? dimN : dimK;
-    hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
-    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
-    return;
-  }
-
-  cublasStatus_t stat;
-  if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_T,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta) {
-  int lda = (HPPL_OP_N == transa) ? dimK : dimM;
-  int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
-  int ldc = dimN;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                alpha,
-                beta,
-                lda,
-                ldb,
-                ldc);
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int incb,
-                          int incc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  cublasStatus_t stat;
-  if (HPPL_OP_N == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else if (HPPL_OP_T == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul_vector");
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta) {
-  hl_matrix_mul_vector(
-      A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
deleted file mode 100644
index b0ac5aaac28..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
+++ /dev/null
@@ -1,1117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cudnn.h"
-#include <cudnn.h>
-#include <gflags/gflags.h>
-#include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-             4096,
-             "Specify cuDNN max workspace limit, in units MB, "
-             "4096MB=4GB by default.");
-
-namespace dynload {
-
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cudbnn routine
- * via operator overloading: operator ()
- *
- * note: default dynamic linked libs
- **/
-
-#ifdef PADDLE_USE_DSO
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> decltype(__name(args...)) {            \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);            \
-      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                  \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);             \
-    }                                                                       \
-  } __name; /* struct DynLoad__##__name */
-
-#else
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-
-#endif
-
-/**
- * include all needed cudnn functions in HPPL
- * different cudnn version has different interfaces
- **/
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetFilter4dDescriptor)                     \
-  __macro(cudnnSetPooling2dDescriptor)                    \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnSetConvolution2dDescriptor)                \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnSoftmaxBackward)                           \
-  __macro(cudnnSoftmaxForward)                            \
-  __macro(cudnnGetVersion)                                \
-  __macro(cudnnGetErrorString)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
-
-// APIs available after R3:
-#if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
-#endif
-
-
-// APIs available after R4:
-#if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
-  __macro(cudnnBatchNormalizationForwardInference)           \
-  __macro(cudnnBatchNormalizationBackward)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
-#endif
-
-// APIs in R5
-#if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
-#endif
-
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
-} /* namespace dynload */
-
-/**
- * Check build-in cudnn function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDNN(cudnnFunc)                                         \
-  do {                                                                 \
-    cudnnStatus_t cudnnStat = cudnnFunc;                               \
-    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                          \
-        << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
-  } while (0)
-
-bool g_is_libcudnn_init = false;
-int g_cudnn_lib_version = 0;
-
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
-}
-
-void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-  size_t cudnn_dso_major = cudnn_dso_ver / 1000;
-  size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
-  // Compare cudnn header version with that of cudnn.so.
-  CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
-        (cudnn_cuh_major == cudnn_dso_major))
-      << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
-      << cudnn_cuh_major << " unmatched!\n"
-      << "PaddlePaddle Requirement: "
-      << "(header v[2-3] with libcudnn v[2-3]) Or "
-      << "(header v4 with libcudnn v4) Or "
-      << "(header v5 with libcudnn v5) Or"
-      << "(header v6 with libcudnn v6).";
-
-  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
-      << "cudnn v5 requires cuda version >= 7.5";
-
-  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
-      << "cudnn v6 requires cuda version >= 8.0";
-
-  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
-  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
-  g_is_libcudnn_init = true;
-  g_cudnn_lib_version = cudnn_dso_ver;
-}
-
-int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
-
-void hl_conv_workspace(hl_tensor_descriptor input,
-                       hl_tensor_descriptor output,
-                       hl_filter_descriptor filter,
-                       hl_convolution_descriptor conv,
-                       int* convFwdAlgo,
-                       size_t* fwdLimitBytes,
-                       int* convBwdDataAlgo,
-                       size_t* bwdDataLimitBytes,
-                       int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes,
-                       bool useDilation) {
-#if CUDNN_VERSION >= 4000
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-
-  // Specify workspace limit directly
-  size_t memoryLimitBytes =
-      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
-  // For dilation
-  int algo = 0;
-
-  // cudnn convolution forward configuration
-  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  if (useDilation) {
-    convFwdAlgo = &algo;
-    convBwdDataAlgo = &algo;
-    convBwdFilterAlgo = &algo;
-  } else {
-    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-        t_resource.cudnn_handle,
-        fwd_src_desc,
-        fwd_filter_desc,
-        fwd_conv_desc,
-        fwd_dest_desc,
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_data_filter_desc,
-        bwd_data_diff_desc,
-        bwd_data_conv_desc,
-        bwd_data_grad_desc,
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_filter_src_desc,
-        bwd_filter_diff_desc,
-        bwd_filter_conv_desc,
-        bwd_filter_grad_desc,
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-  }
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
-      fwdLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-      bwdDataLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-      bwdFilterLimitBytes));
-
-#endif
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
-                                 int batch_size,
-                                 int feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  batch_size,
-                                                  feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_desc->format = CUDNN_TENSOR_NCHW;
-  hl_desc->data_type = data_type;
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  hl_desc->data_type = data_type;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width) {
-  const int stride_w = 1;
-  const int stride_h = width * stride_w;
-  const int stride_c = height * stride_h;
-  const int stride_n = feature_maps * stride_c;
-  return hl_tensor_reshape(image_desc,
-                           batch_size,
-                           feature_maps,
-                           height,
-                           width,
-                           stride_n,
-                           stride_c,
-                           stride_h,
-                           stride_w);
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width,
-                       int nStride,
-                       int cStride,
-                       int hStride,
-                       int wStride) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
-                                                    hl_desc->data_type,
-                                                    batch_size,
-                                                    feature_maps,
-                                                    height,
-                                                    width,
-                                                    nStride,
-                                                    cStride,
-                                                    hStride,
-                                                    wStride));
-
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-}
-
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
-
-  hl_desc->desc = NULL;
-
-  free(image_desc);
-}
-
-void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                  hl_pooling_mode_t mode,
-                                  int height,
-                                  int width,
-                                  int height_padding,
-                                  int width_padding,
-                                  int stride_height,
-                                  int stride_width) {
-  cudnnPoolingMode_t cudnn_mode;
-  switch (mode) {
-    case HL_POOLING_MAX:
-      cudnn_mode = CUDNN_POOLING_MAX;
-      break;
-    case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    default:
-      LOG(FATAL) << "parameter mode error";
-  }
-
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling_desc =
-      (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
-  CHECK_NOTNULL(hl_pooling_desc);
-
-  CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
-                                                   cudnn_mode,
-#if CUDNN_VERSION >= 5000
-                                                   CUDNN_PROPAGATE_NAN,
-#endif
-                                                   height,
-                                                   width,
-                                                   height_padding,
-                                                   width_padding,
-                                                   stride_height,
-                                                   stride_width));
-
-  hl_pooling_desc->mode = cudnn_mode;
-  hl_pooling_desc->window_height = height;
-  hl_pooling_desc->window_width = width;
-  hl_pooling_desc->stride_height = stride_height;
-  hl_pooling_desc->stride_width = stride_width;
-
-  *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
-}
-
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-
-  CHECK_NOTNULL(hl_pooling->desc);
-  CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
-
-  hl_pooling->desc = NULL;
-
-  free(pooling_desc);
-}
-
-void hl_pooling_forward(hl_tensor_descriptor input,
-                        real* input_image,
-                        hl_tensor_descriptor output,
-                        real* output_image,
-                        hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(output_image);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
-                                           pooling_desc,
-                                           &alpha,
-                                           input_desc,
-                                           input_image,
-                                           &beta,
-                                           output_desc,
-                                           output_image));
-  CHECK_SYNC("hl_pooling_forward failed");
-}
-
-void hl_pooling_backward(hl_tensor_descriptor input,
-                         real* input_image,
-                         real* input_image_grad,
-                         hl_tensor_descriptor output,
-                         real* output_image,
-                         real* output_image_grad,
-                         hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(input_image_grad);
-  CHECK_NOTNULL(output_image);
-  CHECK_NOTNULL(output_image_grad);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
-                                            pooling_desc,
-                                            &alpha,
-                                            output_desc,
-                                            output_image,
-                                            output_desc,
-                                            output_image_grad,
-                                            input_desc,
-                                            input_image,
-                                            &beta,
-                                            input_desc,
-                                            input_image_grad));
-  CHECK_SYNC("hl_pooling_backward failed");
-}
-
-void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                 int input_feature_maps,
-                                 int output_feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter =
-      (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
-  CHECK_NOTNULL(hl_filter);
-
-  CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
-                                                  data_type,
-#if CUDNN_VERSION >= 5000
-                                                  CUDNN_TENSOR_NCHW,
-#endif
-                                                  output_feature_maps,
-                                                  input_feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_filter->data_type = data_type;
-  hl_filter->output_feature_maps = output_feature_maps;
-  hl_filter->input_feature_maps = input_feature_maps;
-  hl_filter->filter_height = height;
-  hl_filter->filter_width = width;
-
-  *filter = (hl_filter_descriptor)hl_filter;
-}
-
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
-  CHECK_NOTNULL(hl_filter->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
-
-  hl_filter->desc = NULL;
-
-  free(filter);
-}
-
-void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                      hl_tensor_descriptor image,
-                                      hl_filter_descriptor filter,
-                                      int padding_height,
-                                      int padding_width,
-                                      int stride_height,
-                                      int stride_width,
-                                      int dilation_h,
-                                      int dilation_w) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
-      sizeof(_cudnn_convolution_descriptor));
-
-  CHECK_NOTNULL(hl_conv);
-  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  if (dilation_h > 1 || dilation_w > 1) {
-    LOG(FATAL)
-        << "Current cuDNN version does't support for dilation convolution. "
-        << "The dilation convolution requires cuDNN >= v6.0.";
-  }
-
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-
-  *conv = (hl_convolution_descriptor)hl_conv;
-}
-
-void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                     hl_tensor_descriptor image,
-                                     hl_filter_descriptor filter,
-                                     int padding_height,
-                                     int padding_width,
-                                     int stride_height,
-                                     int stride_width,
-                                     int dilation_h,
-                                     int dilation_w) {
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(image);
-  CHECK_NOTNULL(filter);
-
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-}
-
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  CHECK_NOTNULL(hl_conv->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
-  hl_conv->desc = NULL;
-
-  free(conv);
-}
-
-void hl_convolution_forward(hl_tensor_descriptor input,
-                            real* input_data,
-                            hl_tensor_descriptor output,
-                            real* output_data,
-                            hl_filter_descriptor filter,
-                            real* filter_data,
-                            hl_convolution_descriptor conv,
-                            void* gpuWorkSpace,
-                            size_t sizeInBytes,
-                            int convFwdAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_data);
-  CHECK_NOTNULL(filter_data);
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  CHECK_CUDNN(dynload::cudnnConvolutionForward(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      filter_desc,
-      filter_data,
-      conv_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-      &beta,
-      dest_desc,
-      output_data));
-  CHECK_SYNC("hl_convolution_forward failed");
-}
-
-void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                     real* bias_data,
-                                     hl_tensor_descriptor output,
-                                     real* output_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_data);
-  CHECK_NOTNULL(output_data);
-
-  cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-
-  CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
-#if CUDNN_VERSION < 4000
-                                      CUDNN_ADD_SAME_C,
-#endif
-                                      &alpha,
-                                      bias_desc,
-                                      bias_data,
-                                      &beta,
-                                      output_desc,
-                                      output_data));
-  CHECK_SYNC("hl_convolution_forward_add_bias failed");
-}
-
-void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                  real* bias_grad_data,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_grad_data);
-  CHECK_NOTNULL(output_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
-                                                    &alpha,
-                                                    diff_desc,
-                                                    output_grad_data,
-                                                    &beta,
-                                                    bias_desc,
-                                                    bias_grad_data));
-  CHECK_SYNC("hl_convolution_backward_bias failed");
-}
-
-void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                    real* input_data,
-                                    hl_tensor_descriptor output,
-                                    real* output_grad_data,
-                                    hl_filter_descriptor filter,
-                                    real* filter_grad_data,
-                                    hl_convolution_descriptor conv,
-                                    void* gpuWorkSpace,
-                                    size_t sizeInBytes,
-                                    int convBwdFilterAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_grad_data);
-  CHECK_NOTNULL(filter_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      filter_grad_data));
-  CHECK_SYNC("hl_convolution_backward_filter failed");
-}
-
-void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                  real* input_data_grad,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data,
-                                  hl_filter_descriptor filter,
-                                  real* filter_data,
-                                  hl_convolution_descriptor conv,
-                                  void* gpuWorkSpace,
-                                  size_t sizeInBytes,
-                                  int convBwdDataAlgo) {
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
-      t_resource.cudnn_handle,
-      &alpha,
-      filter_desc,
-      filter_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      input_data_grad));
-  CHECK_SYNC("hl_convolution_backward_data failed");
-}
-
-void hl_softmax_forward(real* input, real* output, int height, int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
-                                           CUDNN_SOFTMAX_ACCURATE,
-                                           CUDNN_SOFTMAX_MODE_CHANNEL,
-                                           &alpha,
-                                           t_resource.cudnn_desc,
-                                           input,
-                                           &beta,
-                                           t_resource.cudnn_desc,
-                                           output));
-  CHECK_SYNC("hl_softmax_forward failed");
-}
-
-void hl_softmax_backward(real* output_value,
-                         real* output_grad,
-                         int height,
-                         int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
-                                            CUDNN_SOFTMAX_ACCURATE,
-                                            CUDNN_SOFTMAX_MODE_CHANNEL,
-                                            &alpha,
-                                            t_resource.cudnn_desc,
-                                            output_value,
-                                            t_resource.cudnn_desc,
-                                            output_grad,
-                                            &beta,
-                                            t_resource.cudnn_desc,
-                                            output_grad));
-  CHECK_SYNC("hl_softmax_backward failed");
-}
-
-void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                    real* input,
-                                    hl_tensor_descriptor outputDesc,
-                                    real* output,
-                                    hl_tensor_descriptor bnParamDesc,
-                                    real* scale,
-                                    real* bias,
-                                    double factor,
-                                    real* runningMean,
-                                    real* runningInvVar,
-                                    double epsilon,
-                                    real* savedMean,
-                                    real* savedVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != runningMean && NULL == runningInvVar) ||
-      (NULL == runningMean && NULL != runningInvVar)) {
-    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
-               << "but only at the same time.";
-  }
-  if ((NULL != savedMean && NULL == savedVar) ||
-      (NULL == savedMean && NULL != savedVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
-                                                      mode,
-                                                      &alpha,
-                                                      &beta,
-                                                      xDesc,
-                                                      input,
-                                                      yDesc,
-                                                      output,
-                                                      bnDesc,
-                                                      scale,
-                                                      bias,
-                                                      factor,
-                                                      runningMean,
-                                                      runningInvVar,
-                                                      epsilon,
-                                                      savedMean,
-                                                      savedVar));
-
-  CHECK_SYNC("hl_batch_norm_forward_training failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                     real* input,
-                                     hl_tensor_descriptor outputDesc,
-                                     real* output,
-                                     hl_tensor_descriptor bnParamDesc,
-                                     real* scale,
-                                     real* bias,
-                                     real* estimatedMean,
-                                     real* estimatedInvVar,
-                                     double epsilon) {
-#if CUDNN_VERSION >= 4007
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       yDesc,
-                                                       output,
-                                                       bnDesc,
-                                                       scale,
-                                                       bias,
-                                                       estimatedMean,
-                                                       estimatedInvVar,
-                                                       epsilon));
-
-  CHECK_SYNC("hl_batch_norm_forward_inference failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                            real* input,
-                            hl_tensor_descriptor outGradDesc,
-                            real* outGrad,
-                            hl_tensor_descriptor inGradDesc,
-                            real* inGrad,
-                            hl_tensor_descriptor dBnParamDesc,
-                            real* scale,
-                            real* scaleGrad,
-                            real* biasGrad,
-                            double epsilon,
-                            real* savedMean,
-                            real* savedInvVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != savedMean && NULL == savedInvVar) ||
-      (NULL == savedMean && NULL != savedInvVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc);
-  cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       dyDesc,
-                                                       outGrad,
-                                                       dxDesc,
-                                                       inGrad,
-                                                       bnDesc,
-                                                       scale,
-                                                       scaleGrad,
-                                                       biasGrad,
-                                                       epsilon,
-                                                       savedMean,
-                                                       savedInvVar));
-
-  CHECK_SYNC("hl_batch_norm_backward failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc
deleted file mode 100644
index 92197afb3d4..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_device.cc
+++ /dev/null
@@ -1,681 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// clang-format off
-// Because clang-format 4.X and clang-format 3.8+ format
-// following lines in different. So disable clang-format.
-#include "hl_cuda.h"
-#include <cuda_profiler_api.h>
-#include <string.h>
-#include <sys/syscall.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include "hl_cuda.ph"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/DynamicLoader.h"
-// clang-format on
-
-namespace dynload {
-
-std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load curand routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    curandStatus_t operator()(Args... args) {                                  \
-      typedef curandStatus_t (*curandFunc)(Args...);                           \
-      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    curandStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed curand functions in HPPL */
-// clang-format off
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)
-// clang-format on
-
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
-
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
-
-} /* namespace dynload */
-
-/**
- * @brief   global resource.
- */
-int g_system_device_num = 0;                /* system device number */
-int device_num = 0;                         /* use    device number */
-hl_device_prop *g_device;                   /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
-int g_cuda_lib_version = 0;
-
-/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
-/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
-/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256 * 4)
-
-/**
- * Check build-in cuda function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << cudaGetErrorString(cudaStat); \
-  } while (0)
-
-/**
- * @brief   thread resource.
- */
-__thread _hl_thread_resource t_resource = {{0},    /* stream */
-                                           0,      /* handle */
-                                           0,      /* gen */
-                                           0,      /* cudnn_handle */
-                                           0,      /* cudnn_desc */
-                                           NULL,   /* gen_mutex */
-                                           NULL,   /* gpu_mem */
-                                           NULL,   /* cpu_mem */
-                                           0,      /* event */
-                                           -1,     /* device */
-                                           0,      /* major */
-                                           false}; /* is_init */
-
-__thread cudaStream_t default_stream = 0;
-__thread bool g_sync_flag = true;
-bool hl_start_flag = false;
-
-inline pid_t gettid() {
-#if defined(__APPLE__) || defined(__OSX__)
-  // syscall is deprecated: first deprecated in macOS 10.12.
-  // syscall is unsupported;
-  // syscall pid_t tid = syscall(SYS_thread_selfid);
-  uint64_t tid;
-  pthread_threadid_np(NULL, &tid);
-#else
-#ifndef _WIN32
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-#else   // _WIN32
-  pid_t tid = _getpid();
-#endif  // _WIN32
-#endif
-  CHECK_NE((int)tid, -1);
-  return tid;
-}
-
-void hl_init(int device) {
-  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
-
-  /* thread has been initialized */
-  if (true == t_resource.is_init) {
-    hl_set_device(device);
-    return;
-  }
-
-  /* create thread devcie resources */
-  char *tmp;
-  thread_device_resources device_res;
-  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
-                       device_num * sizeof(_thread_device_resources));
-  CHECK_NOTNULL(tmp);
-  t_device = (thread_device_resources *)tmp;
-  device_res = (thread_device_resources)(
-      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
-  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  int num = 0;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!g_device[dev]) {
-      continue;
-    }
-
-    t_device[dev] = &device_res[num];
-    t_device[dev]->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
-
-    hl_create_thread_resources(dev, t_device[dev]);
-    num++;
-  }
-
-  hl_cudnn_desc_init(&t_resource.cudnn_desc);
-
-  /* thread initialization is complete */
-  t_resource.is_init = true;
-  /* set device */
-  t_resource.device = -1;
-  hl_set_device(device);
-}
-
-void hl_fini() {
-  if (false == t_resource.is_init) {
-    return;
-  }
-
-  /* hppl stream fini */
-  t_resource.device = -1;
-  for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-    t_resource.stream[i] = 0;
-  }
-
-  char *tmp = (char *)t_device;
-  char *tmp_stream = NULL;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!t_device[dev]) {
-      continue;
-    }
-    if (!tmp_stream) {
-      tmp_stream = (char *)t_device[dev]->stream;
-    }
-    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
-    }
-
-    /* free device memory */
-    hl_free_mem_device(t_device[dev]->gpu_mem);
-    hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
-  }
-
-  free(tmp);
-  free(tmp_stream);
-  t_resource.is_init = false;
-}
-
-int hl_get_device_count() { return device_num; }
-
-void hl_set_device(int device) {
-  if (device == t_resource.device) {
-    return;
-  }
-
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device: " << device << " is not specified in startup.";
-
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* switch thread stream */
-  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
-    t_resource.stream[i] = g_device[device]->device_resources->stream[i];
-  }
-
-  if (true == t_resource.is_init) {
-    for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-      t_resource.stream[i] =
-          t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
-    }
-    t_resource.gpu_mem = t_device[device]->gpu_mem;
-    t_resource.cpu_mem = t_device[device]->cpu_mem;
-    t_resource.event = t_device[device]->mem_event;
-  }
-
-  t_resource.handle = g_device[device]->device_resources->handle;
-  t_resource.gen = g_device[device]->device_resources->gen;
-  t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle;
-  t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex;
-  t_resource.device = device;
-  t_resource.major = g_device[device]->major;
-  default_stream = t_resource.stream[0];
-}
-
-int hl_get_device() {
-  int device;
-  CHECK_CUDA(cudaGetDevice(&device));
-  return device;
-}
-
-void *hl_malloc_device(size_t size) {
-  void *dest_d;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
-
-  return dest_d;
-}
-
-void hl_free_mem_device(void *dest_d) {
-  CHECK_NOTNULL(dest_d);
-
-  cudaError_t err = cudaFree(dest_d);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void *hl_malloc_host(size_t size) {
-  void *dest_h;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
-
-  return dest_h;
-}
-
-void hl_free_mem_host(void *dest_h) {
-  CHECK_NOTNULL(dest_h);
-
-  cudaError_t err = cudaFreeHost(dest_h);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void hl_memcpy(void *dst, void *src, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
-}
-
-void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(cudaMemset(dest_d, value, size));
-}
-
-void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(src_h);
-  CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
-}
-
-void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_h);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
-}
-
-void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_d);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
-}
-
-void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_LT(stream, HPPL_STREAM_END);
-  cu_stream = t_resource.stream[stream];
-
-  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
-}
-
-void hl_start() {
-  hl_specify_devices_start(NULL, 0);
-  /* set default device */
-  hl_set_device(0);
-}
-
-bool hl_device_can_access_peer(int device, int peerDevice) {
-  int canAccessPeer;
-  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
-
-  if (canAccessPeer == 1) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
-  if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    cudaGetLastError();
-  } else {
-    CHECK_CUDA(err);
-  }
-}
-
-void hl_create_global_resources(hl_device_prop device_prop) {
-  struct cudaDeviceProp cu_prop;
-  int device = device_prop->device;
-  global_device_resources device_res = device_prop->device_resources;
-
-  CHECK_CUDA(cudaSetDevice(device));
-  /* device properties */
-  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
-
-  device_prop->major = cu_prop.major;
-  device_prop->minor = cu_prop.minor;
-  strncpy(device_prop->device_name, cu_prop.name, 256);
-  device_prop->device_mem = cu_prop.totalGlobalMem;
-
-  /* create device stream */
-  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* cublas init */
-  hl_cublas_init(&device_res->handle, device_res->stream[0]);
-
-  /* create curand gen */
-  CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
-                                          CURAND_RNG_PSEUDO_DEFAULT),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand init failed.";
-
-  CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand set stream failed!";
-
-  /* create cudnn handle */
-  hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
-
-  int seed = gettid();
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
-                                                       seed + device),
-           CURAND_STATUS_SUCCESS);
-
-  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
-  pthread_mutex_init(device_res->gen_mutex, NULL);
-
-  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
-}
-
-int hl_get_cuda_version() { return g_cuda_lib_version; }
-
-void hl_create_thread_resources(int device,
-                                thread_device_resources device_res) {
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* create thread stream */
-  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* allocation device memory */
-  device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
-
-  /* allocation host memory */
-  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
-
-  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
-}
-
-void hl_specify_devices_start(int *device, int number) {
-  if (hl_start_flag) return;
-
-  /* 1. get the number of devices */
-  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
-  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
-  if (device == NULL) {
-    number = g_system_device_num;
-  }
-
-  /* 2. check device & create device property table */
-  CHECK_LE(number, g_system_device_num)
-      << "[Start failed] System does not have enough device. "
-      << "Device number: " << g_system_device_num << "Input number: " << number;
-
-  char *tmp;
-  hl_device_prop device_prop;
-  tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
-                       number * sizeof(_hl_device_prop));
-  CHECK(tmp) << "[Start failed] System memory is not enough.";
-
-  g_device = (hl_device_prop *)tmp;
-  device_prop = (hl_device_prop)(
-      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
-  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
-  int num = 0;
-  for (int i = 0; i < number; i++) {
-    int dev;
-    if (device == NULL) {
-      dev = i;
-    } else {
-      dev = device[i];
-    }
-
-    CHECK_LT(dev, g_system_device_num)
-        << "[Start failed] The specified device number is "
-        << "out of range. Max device number: " << g_system_device_num - 1
-        << " Specified devcie number: " << dev;
-
-    if (g_device[dev]) {
-      /* Warning */
-      LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
-      continue;
-    }
-
-    g_device[dev] = &device_prop[num];
-    g_device[dev]->device = dev;
-    num++;
-  }
-  device_num = num;
-
-  /* 3.  create global device resources */
-  char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
-  CHECK_NOTNULL(tmp_res);
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  num = 0;
-  for (int i = 0; i < g_system_device_num; i++) {
-    if (!g_device[i]) {
-      continue;
-    }
-
-    g_device[i]->device_resources = (global_device_resources)(
-        tmp_res + num * sizeof(_global_device_resources));
-    g_device[i]->device_resources->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
-
-    hl_create_global_resources(g_device[i]);
-    num++;
-  }
-
-  /* hl_start() is ok */
-  hl_start_flag = true;
-  /* set default device */
-  if (device == NULL) {
-    hl_set_device(0);
-  } else {
-    hl_set_device(device[0]);
-  }
-}
-
-void hl_rand(real *dest_d, size_t num) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(
-#ifndef PADDLE_TYPE_DOUBLE
-      dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
-#else
-      dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
-#endif
-      CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-  CHECK_SYNC("hl_rand failed");
-}
-
-void hl_srand(unsigned int seed) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
-           CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-}
-
-void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-
-bool hl_get_sync_flag() { return g_sync_flag; }
-
-void hl_stream_synchronize(hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
-}
-
-void hl_create_event(hl_event_t *event) {
-  CHECK_NOTNULL(event);
-
-  struct _hl_event_st *st_event =
-      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
-
-  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
-
-  *event = st_event;
-}
-
-float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
-  float time;
-  CHECK_NOTNULL(start);
-  CHECK_NOTNULL(end);
-
-  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
-  return time;
-}
-
-void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
-}
-
-void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
-}
-
-void hl_destroy_event(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventDestroy(event->cu_event));
-
-  free(event);
-  event = NULL;
-}
-
-void hl_event_synchronize(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
-}
-
-void hl_get_device_name(char *name, int len, int device) {
-  CHECK_NOTNULL(name);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  strncpy(name, g_device[device]->device_name, len);
-}
-
-void hl_get_device_memory(size_t *mem_size, int device) {
-  CHECK_NOTNULL(mem_size);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *mem_size = g_device[device]->device_mem;
-}
-
-void hl_get_device_compute_capability(int *major, int *minor, int device) {
-  CHECK_NOTNULL(major);
-  CHECK_NOTNULL(minor);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *major = g_device[device]->major;
-  *minor = g_device[device]->minor;
-}
-
-int hl_get_device_last_error() { return (int)cudaGetLastError(); }
-
-const char *hl_get_device_error_string() {
-  cudaError_t err = cudaGetLastError();
-  return cudaGetErrorString(err);
-}
-
-const char *hl_get_device_error_string(size_t err) {
-  return cudaGetErrorString((cudaError_t)err);
-}
-
-void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
-void hl_set_device_flags_block() {
-  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-}
-
-bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = cudaEventQuery(event->cu_event);
-  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
-
-  if (cudaErrorNotReady == err) {
-    return false;
-  }
-  return true;
-}
-
-void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-
-void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/legacy/cuda/src/hl_cuda_lstm.cu b/paddle/legacy/cuda/src/hl_cuda_lstm.cu
deleted file mode 100644
index 9ac564fd254..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_lstm.cu
+++ /dev/null
@@ -1,876 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_activation_functions.h"
-#include "hl_base.h"
-#include "hl_cuda_cublas.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-typedef hppl::Active<real>::forward t_forward;
-typedef hppl::Active<real>::backward t_backward;
-
-bool hl_lstm_sequence_parallel(int frameSize) {
-  if (frameSize == 32 || frameSize == 64) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-class frameValue {
- public:
-  real *value_;
-  __device__ frameValue(real *value) : value_(value) {}
-  template <int reversed, int frameSize>
-  __device__ inline void init(int start, int length, int idx) {
-    if (reversed == 0) {
-      value_ += start * frameSize + idx;
-    } else {
-      value_ += (start + length - 1) * frameSize + idx;
-    }
-  }
-  __device__ inline real *getPtr() const { return value_; }
-  __device__ inline real getValue() { return *value_; }
-  __device__ inline void setValue(real value) { *value_ = value; }
-  template <int reversed, int frameSize>
-  __device__ inline void nextFrame() {
-    if (reversed == 0) {
-      value_ += frameSize;
-    } else {
-      value_ -= frameSize;
-    }
-  }
-};
-
-__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
-  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
-  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-template <int valueSize, int frameSize>
-__device__ __forceinline__ real forward_sequence(real value,
-                                                 real *shValue,
-                                                 real *state,
-                                                 real *preOutput,
-                                                 real *output,
-                                                 real check,
-                                                 int index,
-                                                 t_forward activeNode,
-                                                 t_forward activeGate,
-                                                 t_forward activeState) {
-  real out;
-  real prevOut;
-  real state_r;
-  const int idx = index % frameSize;
-  const int idy = index / frameSize;
-  // assert(index < valueSize);
-
-  if (idy == 0) {
-    value = activeNode(value);
-    shValue[index] = value;
-  }
-  if (idy == 1 || idy == 2) {
-    state_r = state[idx];
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-  }
-  ptx_sync(1, valueSize);
-  if (idy == 3) {
-    state_r = state[idx];
-    state_r = state_r * shValue[idx + frameSize * 2];
-    state_r += shValue[idx] * shValue[idx + frameSize];
-    state[idx] = state_r;
-    ptx_arrive(2, frameSize * 2);
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-    ptx_sync(3, frameSize * 2);
-    prevOut = preOutput[idx];
-    out = prevOut * value;
-    output[idx] = out;
-  }
-  if (idy == 0) {
-    ptx_sync(2, frameSize * 2);
-    prevOut = state[idx];
-    prevOut = activeState(prevOut);
-    preOutput[idx] = prevOut;
-    ptx_arrive(3, frameSize * 2);
-  }
-  return value;
-}
-
-#define OUTPUT_BARRIER_ID 10
-#define OUTPUT_BARRIER_ID2 11
-template <int valueSize,
-          int frameSize,
-          int reversed,
-          int computeThreads,
-          int blockSize>
-__global__ void KeLstmForward(real *gateValue,
-                              real *state,
-                              real *output,
-                              real *preOutput,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *starts,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  __shared__ real shValue[valueSize];
-  __shared__ real shState[frameSize];
-  __shared__ real shPrevOutput[frameSize];
-  __shared__ real shOutput[frameSize];
-
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  /* init */
-  real check;
-  real value;
-  frameValue frameGate(gateValue);
-  frameValue frameState(state);
-  frameValue frameOutput(output);
-  frameValue framePreOutput(preOutput);
-  if (index < valueSize) {
-    const int idx = index % frameSize;
-    const int idy = index / frameSize;
-    frameGate.init<reversed, valueSize>(start, length, index);
-    value = frameGate.getValue();
-    if (idy == 0) {
-      shState[idx] = 0.0;
-    } else if (idy == 1) {
-      check = checkIg[idx];
-    } else if (idy == 2) {
-      check = checkFg[idx];
-    } else if (idy == 3) {
-      check = checkOg[idx];
-    }
-
-    if (idy == 3) {
-      frameState.init<reversed, frameSize>(start, length, idx);
-      frameOutput.init<reversed, frameSize>(start, length, idx);
-      framePreOutput.init<reversed, frameSize>(start, length, idx);
-    }
-
-    ptx_sync(1, valueSize);
-  }
-
-  for (int i = 0; i < length; ++i) {
-    if (index < valueSize) {
-      if (valueSize == 128) {
-        if (i != 0) {
-          ptx_sync(OUTPUT_BARRIER_ID2, blockSize);
-          value += shValue[index];
-        }
-      }
-      value = forward_sequence<valueSize, frameSize>(
-          value,
-          shValue,
-          shState,
-          shPrevOutput,
-          shOutput,
-          check,
-          index,
-          hppl::gpu::forward[active_node],
-          hppl::gpu::forward[active_gate],
-          hppl::gpu::forward[active_state]);
-      const int idx = index % frameSize;
-      const int idy = index / frameSize;
-      if (valueSize == 128) {
-        if (idy == 3) {
-          ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128);
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(OUTPUT_BARRIER_ID, valueSize);
-      }
-      frameGate.setValue(value);
-      if (idy == 3) {
-        frameState.setValue(shState[idx]);
-        frameOutput.setValue(shOutput[idx]);
-        framePreOutput.setValue(shPrevOutput[idx]);
-        frameState.nextFrame<reversed, frameSize>();
-        frameOutput.nextFrame<reversed, frameSize>();
-        framePreOutput.nextFrame<reversed, frameSize>();
-      }
-      if (i != length - 1) {
-        frameGate.nextFrame<reversed, valueSize>();
-        value = frameGate.getValue();
-      }
-    }
-    if (i != length - 1) {
-      if (valueSize == 128) {
-        if (valueSize <= index) {
-          real B_r[frameSize];
-          const int computeIdx = index - valueSize;
-          if (i == 0) {
-#pragma unroll
-            for (int n = 0; n < frameSize; n++) {
-              B_r[n] = weight[n * valueSize + computeIdx];
-            }
-          }
-          ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128);
-          real A_r[frameSize];
-          for (int n = 0; n < frameSize; n++) {
-            A_r[n] = shOutput[n];
-          }
-          real sum = 0.0f;
-          for (int n = 0; n < frameSize; n++) {
-            sum += A_r[n] * B_r[n];
-          }
-          shValue[computeIdx] = sum;
-          ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
-        }
-      }
-      if (valueSize == 256) {
-        real B_r[frameSize];
-        if (i == 0) {
-#pragma unroll
-          for (int n = 0; n < frameSize; n++) {
-            B_r[n] = weight[n * valueSize + index];
-          }
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += shOutput[n] * B_r[n];
-        }
-        value += sum;
-      }
-    }
-  }
-}
-
-void hl_lstm_parallel_forward(real *gateValue,
-                              real *stateValue,
-                              real *preOutputValue,
-                              real *outputValue,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *sequence,
-                              int frameSize,
-                              int numSequences,
-                              bool reversed,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_forward failed");
-}
-
-__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  const int warp_size = 32;
-  int addr = idx % warp_size;
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, addr < warp_size);
-#pragma unroll
-  for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
-    a[k] = __shfl_sync(mask, a[k], addr, 32);
-  }
-
-#pragma unroll
-  for (int tid = 0; tid < 31; tid++) {
-    real tmp = (idx > tid) ? a[0] : a[1];
-#pragma unroll
-    for (int k = 31; k > 0; k--) {
-      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
-    }
-    a[1] = tmp;
-  }
-
-  addr = (32 - idx) % 32;
-  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
-#pragma unroll
-  for (int k = 0; k < 32; k++) {
-    a[k] = __shfl_sync(mask, a[k], addr, 32);
-    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void backward_sequence(real rGateValue,
-                                  real rOutputGrad,
-                                  real rPreOutputValue,
-                                  real &rGateGrad,
-                                  real &rStateGrad,
-                                  real *shStateGrad,
-                                  real *shStateValue,
-                                  real *shGateValue,
-                                  real rCheck,
-                                  real &rGateValuePrev,
-                                  int index,
-                                  t_backward activeNode,
-                                  t_backward activeGate,
-                                  t_backward activeState) {
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  if (frameIdy == 3) {
-    real rPrevOutputGrad;
-    rPrevOutputGrad = rOutputGrad * rGateValue;
-    rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue);
-    rGateGrad = rOutputGrad * rPreOutputValue;
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_arrive(3, valueSize);
-  } else if (frameIdy == 1) {
-    shGateValue[frameIdx + frameSize] = rGateValue;
-    rStateGrad = rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 2) {
-    rStateGrad = rStateGrad * rGateValuePrev;
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateValuePrev = rGateValue;
-    rGateGrad = rStateGrad * shStateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 0) {
-    shGateValue[frameIdx] = rGateValue;
-    ptx_sync(3, valueSize);
-    rStateGrad = shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
-    rGateGrad = activeNode(rGateGrad, rGateValue);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void load_weight(real rWeight[], real *weight, const int index) {
-  if (valueSize == 128) {
-    weight += index;
-#pragma unroll
-    for (int n = 0; n < frameSize; n++) {
-      rWeight[n] = weight[n * valueSize];
-    }
-    transpose_32x32(rWeight, index % 32);
-  }
-  if (valueSize == 256) {
-    int id = (index / 32) % 2;
-    weight += index - id * 32 + id * 32 * valueSize;
-#pragma unroll
-    for (int n = 0; n < 32; n++) {
-      rWeight[n] = weight[n * valueSize];
-      rWeight[n + 32] = weight[n * valueSize + 32];
-    }
-    transpose_32x32(rWeight, index % 32);
-    transpose_32x32(&rWeight[32], index % 32);
-  }
-}
-
-template <int valueSize, int frameSize, int reversed>
-__global__ void KeLstmBackward(real *gateValue,
-                               real *gateGrad,
-                               real *stateValue,
-                               real *stateGrad, /* do not need save */
-                               real *preOutputValue,
-                               real *preOutputGrad, /* do not need save */
-                               real *checkIg,
-                               real *checkIgGrad,
-                               real *checkFg,
-                               real *checkFgGrad,
-                               real *checkOg,
-                               real *checkOgGrad,
-                               real *outputGrad,
-                               real *weightValue,
-                               const int *starts,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  __shared__ real shGateValue[valueSize];
-  __shared__ real shStateGrad[valueSize];
-  __shared__ real shStateValue[frameSize];
-  __shared__ real shGateGrad[4][frameSize];
-  __shared__ real shOutputGrad[4][frameSize];
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  real rCheck;
-  real rCheckGrad;
-  real rGateGrad;
-  real rStateGrad;
-  real rGateValuePrev;
-  real rPreOutputValue;
-  real rOutputGrad;
-  real rGateValue;
-  real rStateValue;
-
-  frameValue frameGateValue(gateValue);
-  frameValue frameGateGrad(gateGrad);
-  frameValue framePreOutputValue(preOutputValue);
-  frameValue frameStateValue(stateValue);
-  frameValue frameOutputGrad(outputGrad);
-  if (frameIdy == 0) {
-  } else if (frameIdy == 1) {
-    rCheck = checkIg[frameIdx];
-  } else if (frameIdy == 2) {
-    rCheck = checkFg[frameIdx];
-    rGateValuePrev = 0.0;
-    rStateGrad = 0.0;
-  } else if (frameIdy == 3) {
-    rCheck = checkOg[frameIdx];
-    framePreOutputValue.init<!reversed, frameSize>(start, length, frameIdx);
-    frameOutputGrad.init<!reversed, frameSize>(start, length, frameIdx);
-    rOutputGrad = frameOutputGrad.getValue();
-    rPreOutputValue = framePreOutputValue.getValue();
-    frameStateValue.init<!reversed, frameSize>(start, length, frameIdx);
-    rStateValue = frameStateValue.getValue();
-  }
-
-  frameGateValue.init<!reversed, valueSize>(start, length, index);
-  frameGateGrad.init<!reversed, valueSize>(start, length, index);
-  rGateValue = frameGateValue.getValue();
-  rGateGrad = 0.0;
-  rCheckGrad = 0.0;
-
-  real B_r[frameSize];
-  load_weight<valueSize, frameSize>(B_r, weightValue, index);
-
-  for (int i = 0; i < length; ++i) {
-    if (frameIdy == 3) {
-      if (i != length - 1) {
-        frameStateValue.nextFrame<!reversed, frameSize>();
-        shStateValue[frameIdx] = frameStateValue.getValue();
-      } else {
-        shStateValue[frameIdx] = 0.0;
-      }
-    }
-    backward_sequence<valueSize, frameSize>(rGateValue,
-                                            rOutputGrad,
-                                            rPreOutputValue,
-                                            rGateGrad,
-                                            rStateGrad,
-                                            shStateGrad,
-                                            shStateValue,
-                                            shGateValue,
-                                            rCheck,
-                                            rGateValuePrev,
-                                            index,
-                                            hppl::gpu::backward[active_node],
-                                            hppl::gpu::backward[active_gate],
-                                            hppl::gpu::backward[active_state]);
-    if (frameIdy == 3) {
-      rCheckGrad += rGateGrad * rStateValue;
-      rStateValue = shStateValue[frameIdx];
-    }
-
-    frameGateGrad.setValue(rGateGrad);
-    frameGateGrad.nextFrame<!reversed, valueSize>();
-
-    if (i != length - 1) {
-      if (frameIdy == 3) {
-        framePreOutputValue.nextFrame<!reversed, frameSize>();
-        rPreOutputValue = framePreOutputValue.getValue();
-        frameOutputGrad.nextFrame<!reversed, frameSize>();
-        rOutputGrad = frameOutputGrad.getValue();
-      } else if (frameIdy == 2) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      } else if (frameIdy == 1) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      }
-
-      frameGateValue.nextFrame<!reversed, valueSize>();
-      rGateValue = frameGateValue.getValue();
-      shGateGrad[frameIdy][frameIdx] = rGateGrad;
-      if (valueSize == 128) {
-        real sum = 0.0f;
-#pragma unroll
-        for (int n = 0; n < frameSize; n++) {
-          sum += shGateGrad[frameIdy][n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(5, valueSize);
-        real A_r[frameSize];
-        for (int n = 0; n < frameSize; n++) {
-          A_r[n] = shGateGrad[frameIdy][n];
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += A_r[n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-
-      if (frameIdy == 3) {
-        ptx_sync(6, valueSize);
-#pragma unroll
-        for (int i = 0; i < 3; i++) {
-          rOutputGrad += shOutputGrad[i][frameIdx];
-        }
-      } else {
-        ptx_arrive(6, valueSize);
-      }
-    }
-  }
-
-  /* TODO: Temporary save & merger in another kernel */
-  if (frameIdy == 1) {
-    if (checkIgGrad)
-      paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 2) {
-    if (checkFgGrad)
-      paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 3) {
-    if (checkOgGrad)
-      paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
-  }
-}
-
-void hl_lstm_parallel_backward_data(real *gateValue,
-                                    real *gateGrad,
-                                    real *stateValue,
-                                    real *stateGrad,
-                                    real *preOutputValue,
-                                    real *preOutputGrad,
-                                    real *outputGrad,
-                                    real *checkIg,
-                                    real *checkIgGrad,
-                                    real *checkFg,
-                                    real *checkFgGrad,
-                                    real *checkOg,
-                                    real *checkOgGrad,
-                                    real *weight,
-                                    const int *sequence,
-                                    int frameSize,
-                                    int numSequences,
-                                    bool reversed,
-                                    hl_activation_mode_t active_node,
-                                    hl_activation_mode_t active_gate,
-                                    hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
-        frameSize == 256);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_data");
-}
-
-template <int B_X, int B_Y>
-__global__ void KeSetGradZero(real *gateGrad,
-                              const int *starts,
-                              int valueSize,
-                              int numSequences,
-                              bool reversed) {
-  // const int tid = threadIdx.x;
-
-  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
-  const int numSeqId = blockIdx.y * B_Y + threadIdx.y;
-
-  if (numSeqId >= numSequences || frameIdx >= valueSize) return;
-
-  if (!reversed) {
-    int seqId = starts[numSeqId];
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  } else {
-    int seqId = starts[numSeqId + 1] - 1;
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  }
-}
-
-void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                      real *outputValue,
-                                      real *gateGrad,
-                                      const int *sequence,
-                                      int frameSize,
-                                      int batchSize,
-                                      int numSequences,
-                                      bool reversed) {
-  int valueSize = 4 * frameSize;
-  dim3 threads(32, 32);
-  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
-  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      gateGrad, sequence, valueSize, numSequences, reversed);
-
-  if (!reversed) {
-    hl_matrix_mul(outputValue,
-                  HPPL_OP_T,
-                  gateGrad + valueSize,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  } else {
-    hl_matrix_mul(outputValue + frameSize,
-                  HPPL_OP_T,
-                  gateGrad,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_weight");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_matrix.cu b/paddle/legacy/cuda/src/hl_cuda_matrix.cu
deleted file mode 100644
index 6fe460026bb..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_matrix.cu
+++ /dev/null
@@ -1,806 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
-#include "hl_matrix.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sequence.h"
-#include "hl_sparse.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
-void hl_matrix_add(real* A_d,
-                   real* B_d,
-                   real* C_d,
-                   int dimM,
-                   int dimN,
-                   real alpha,
-                   real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
-      ternary::_add<real>(alpha, beta),
-      A_d,
-      B_d,
-      C_d,
-      dimM,
-      dimN,
-      dimN,
-      dimN,
-      dimN);
-  CHECK_SYNC("hl_matrix_add failed");
-}
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define THRESHOLD 128
-#else
-#define THRESHOLD 64
-#endif
-__device__ __forceinline__ void findMax(real* I,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN,
-                                        real* max) {
-  dfMax_s[base] = -1.0e20;
-  while (curIdx < dimN) {
-    if (dfMax_s[base] < I[nextIdx]) {
-      dfMax_s[base] = I[nextIdx];
-    }
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      if (dfMax_s[base] < dfMax_s[nextIdx]) {
-        dfMax_s[base] = dfMax_s[nextIdx];
-      }
-    }
-  }
-
-  if (0 == base) {
-    max[0] = dfMax_s[0];
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void subMaxAndExp(real* I,
-                                             real* O,
-                                             int curIdx,
-                                             int nextIdx,
-                                             int blockSize,
-                                             int dimN,
-                                             real max) {
-  real val;
-  while (curIdx < dimN) {
-    val = I[nextIdx] - max;
-    if (val < -THRESHOLD) {
-      val = -THRESHOLD;
-    }
-    I[nextIdx] = val;
-#ifndef PADDLE_TYPE_DOUBLE
-    O[nextIdx] = __expf(val);
-#else
-    O[nextIdx] = exp(val);
-#endif
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void valueSum(real* O,
-                                         real* dfMax_s,
-                                         int blockSize,
-                                         int base,
-                                         int curIdx,
-                                         int nextIdx,
-                                         int dimN) {
-  dfMax_s[base] = 0;
-  while (curIdx < dimN) {
-    dfMax_s[base] += O[nextIdx];
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      dfMax_s[base] += dfMax_s[nextIdx];
-    }
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void divSum(
-    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
-  while (curIdx < dimN) {
-    O[nextIdx] /= sum;
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-}
-
-__device__ __forceinline__ void softmax(real* I,
-                                        real* O,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN) {
-  __shared__ real max;
-
-  // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
-
-  // sub max Value and do Exp operation
-  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
-
-  // add dimN values into blockDim.x buffer
-  // sum is in dfMax_s[0]
-  valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-
-  // divided by sum
-  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
-}
-
-template <int blockSize>
-__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
-  int base = threadIdx.x;
-  __shared__ real dfMax_s[blockSize];
-  int nextIdx = blockIdx.x * dimN + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
-  CHECK_SYNC("hl_matrix_softmax failed");
-}
-
-template <int blockSize>
-__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
-  int base = threadIdx.x;
-  int bid = blockIdx.x;
-  __shared__ real dfMax_s[blockSize];
-
-  int start = index[bid];
-  int dimN = index[bid + 1] - start;
-
-  int nextIdx = start + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_sequence_softmax_forward(real* A_d,
-                                 real* C_d,
-                                 const int* index,
-                                 int numSequence) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
-  CHECK_SYNC("hl_sequence_softmax_forward failed");
-}
-
-__global__ void KeMatrixDerivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
-  }
-}
-
-void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(sftmaxSum_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, sftmaxSum_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_softmax_derivative failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropy(
-    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      entropy[index] -= log(1 - output[index * dimN + i]);
-    }
-    int* row_col = col + row[index];
-    int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i++) {
-      real o = output[index * dimN + row_col[i]];
-      entropy[index] -= log(o / (1 - o));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy(real* output,
-                                          real* entropy,
-                                          hl_sparse_matrix_s csr_mat,
-                                          int dimM,
-                                          int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(entropy);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(
-    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
-  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      int index = row_idx * dimN + i;
-      grad[index] += 1.0 / (1 - output[index]);
-    }
-    int col_num = row[row_idx + 1] - row[row_idx];
-    int* row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i++) {
-      int index = row_idx * dimN + row_col[i];
-      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
-}
-
-__global__ void KeMatrixCrossEntropy(
-    real* O, real* E, int* label, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int newBase;
-  if (index < dimM) {
-    newBase = label[index];
-    newBase = newBase % dimN;
-    E[index] = -log(O[index * dimN + newBase]);
-  }
-}
-
-void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  int blocks = (dimM + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, C_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy failed");
-}
-
-__global__ void KeMatrixCrossEntropyBp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    if (label_d[rowIdx] == colIdx) {
-      grad_d[index] -= 1.0f / output_d[index];
-    }
-  }
-}
-
-void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(label_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
-}
-
-void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
-}
-
-__global__ void KeParamReluForward(real* output,
-                                   real* input,
-                                   real* w,
-                                   int width,
-                                   int height,
-                                   int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    output[index] =
-        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
-  }
-}
-
-void hl_param_relu_forward(real* output,
-                           real* input,
-                           real* w,
-                           int width,
-                           int height,
-                           int partial_sum) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(w);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, input, w, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_forward failed");
-}
-
-template <int blockSize>
-__global__ void KeParamReluBackWardW(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum) {
-  const int tid = threadIdx.x;
-  __shared__ real temp[blockSize];
-  grad_o += partial_sum * blockIdx.x;
-  input += partial_sum * blockIdx.x;
-  real tmp = 0.0;
-  for (int index = tid; index < partial_sum * height; index += blockSize) {
-    int row = index / partial_sum;
-    int offset = row * width + (index - row * partial_sum);
-    if (input[offset] < 0) {
-      tmp += grad_o[offset] * input[offset];
-    }
-  }
-  temp[tid] = tmp;
-  __syncthreads();
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      temp[tid] += temp[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    grad_w[blockIdx.x] += temp[0];
-  }
-}
-
-void hl_param_relu_backward_w(real* grad_w,
-                              real* grad_o,
-                              real* input,
-                              int width,
-                              int height,
-                              int partial_sum) {
-  CHECK_NOTNULL(grad_w);
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(input);
-  const int blockSize = 1024;
-  int grid_num = width / partial_sum;
-  dim3 threads(blockSize, 1);
-  dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_w, grad_o, input, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_w failed");
-}
-
-__global__ void KeParamReluBackwardDiff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    diff[index] += grad_o[index] * (input[index] > 0 ? 1 : w[tx / partial_sum]);
-  }
-}
-
-void hl_param_relu_backward_diff(real* grad_o,
-                                 real* data,
-                                 real* w,
-                                 real* diff,
-                                 int width,
-                                 int height,
-                                 int partial_sum) {
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(data);
-  CHECK_NOTNULL(w);
-  CHECK_NOTNULL(diff);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_o, data, w, diff, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_diff failed");
-}
-
-__global__ void KeMatrixAddSharedBias(
-    real* A, real* B, const int channel, const int M, const int N, real scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int dim = N / channel;
-  if (index < M * N) {
-    int i = index % N;
-    i = i / dim;
-    A[index] += scale * B[i];
-  }
-}
-
-void hl_matrix_add_shared_bias(real* A_d,
-                               real* B_d,
-                               const int channel,
-                               const int dimM,
-                               const int dimN,
-                               real scale) {
-  const int blocks = 512;
-  const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      A_d, B_d, channel, dimM, dimN, scale);
-  CHECK_SYNC("hl_matrix_add_shared_bias failed");
-}
-
-template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real* B,
-                                          real* A,
-                                          const int channel,
-                                          const int M,
-                                          const int N,
-                                          const int dim,
-                                          const int limit,
-                                          real scale) {
-  if (dim < limit) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < channel) {
-      real sum = 0.0;
-      for (int i = 0; i < M; ++i) {
-        for (int j = 0; j < dim; ++j) {
-          sum += A[i * N + index * dim + j];
-        }
-      }
-      B[index] += scale * sum;
-    }
-  } else {
-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
-    __shared__ real smem[blockSize];
-    real sum = 0.0;
-    for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
-      int n = j * blockSize + tid;
-      int m = n / dim;
-      int w = n % dim;
-      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
-      __syncthreads();
-      simpleReduce(smem, tid, blockSize);
-      sum += smem[0];
-    }
-    if (tid == 0) {
-      B[bid] += scale * sum;
-    }
-  }
-}
-
-void hl_matrix_collect_shared_bias(real* B_d,
-                                   real* A_d,
-                                   const int channel,
-                                   const int dimM,
-                                   const int dimN,
-                                   real scale) {
-  const int dim = dimN / channel;
-  const int blocks = 256;
-  const int limit = 64;
-  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
-
-  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
-  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
-}
-
-__global__ void keMatrixRotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < dimM * dimN) {
-    int i = idx / dimN;
-    int j = idx % dimN;
-    if (clockWise) {
-      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
-    } else {
-      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
-    }
-  }
-}
-
-void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  CHECK_NOTNULL(mat);
-  CHECK_NOTNULL(matRot);
-  const int threads = 512;
-  const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
-      mat, matRot, dimM, dimN, clockWise);
-  CHECK_SYNC("hl_matrix_rotate failed");
-}
-
-__global__ void keMatrixVol2Col(int num_kernels,
-                                const real* dataSrc,
-                                real* dataDst,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    int w_out = index % width_col;
-    int h_out = (index / width_col) % height_col;
-    int d_out = (index / width_col / height_col) % depth_col;
-    int channel_in = index / width_col / height_col / depth_col;
-    int channel_out = channel_in * filterD * filterH * filterW;
-    int w_in = w_out * strideW - paddingW;
-    int h_in = h_out * strideH - paddingH;
-    int d_in = d_out * strideD - paddingD;
-
-    dataDst +=
-        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
-        w_out;
-    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
-    for (int k = 0; k < filterD; ++k) {
-      for (int i = 0; i < filterH; ++i) {
-        for (int j = 0; j < filterW; ++j) {
-          int d = d_in + k;
-          int h = h_in + i;
-          int w = w_in + j;
-          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
-                      w < width)
-                         ? dataSrc[(k * height + i) * width + j]
-                         : 0;
-          dataDst += depth_col * height_col * width_col;
-        }
-      }
-    }
-  }
-}
-
-void hl_matrix_vol2Col(const real* dataSrc,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real* dataDst) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth_col * height_col * width_col;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataSrc,
-                                                          dataDst,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col);
-  CHECK_SYNC("hl_matrix_vol2Col failed");
-}
-
-__global__ void keMatrixCol2Vol(int num_kernels,
-                                real* dataDst,
-                                const real* dataSrc,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col,
-                                real alpha,
-                                real beta) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    real srcVal = 0;
-    real dstVal = dataDst[index];
-    int w = index % width + paddingW;
-    int h = (index / width) % height + paddingH;
-    int d = (index / width / height) % depth + paddingD;
-    int c = index / width / height / depth;
-    // compute the start and end of the output
-    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
-    int w_col_end = min(w / strideW + 1, width_col);
-    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
-    int h_col_end = min(h / strideH + 1, height_col);
-    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
-    int d_col_end = min(d / strideD + 1, depth_col);
-
-    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
-                  h * filterW + w) *
-                 depth_col * height_col * width_col;
-
-    int coeff_d_col =
-        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
-    int coeff_h_col =
-        (1 - strideH * filterW * depth_col * height_col) * width_col;
-    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
-
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
-                            w_col * coeff_w_col];
-        }
-      }
-    }
-    dataDst[index] = alpha * srcVal + beta * dstVal;
-  }
-}
-
-void hl_matrix_col2Vol(real* dataDst,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       const real* dataSrc,
-                       real alpha,
-                       real beta) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth * height * width;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataDst,
-                                                          dataSrc,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col,
-                                                          alpha,
-                                                          beta);
-
-  CHECK_SYNC("hl_matrix_col2Vol failed");
-}
-
-__global__ void keVectorCast2Int(int* out, real* vec, int size) {
-  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
-    out[i] = int(vec[i]);
-  }
-}
-
-void hl_vector_cast2int(int* out, real* vec, int size) {
-  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
-  CHECK_SYNC("hl_vector_cast2int failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sequence.cu b/paddle/legacy/cuda/src/hl_cuda_sequence.cu
deleted file mode 100644
index 1d772b5ce27..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sequence.cu
+++ /dev/null
@@ -1,408 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-__global__ void KeMaxSequenceForward(real* input,
-                                     const int* sequence,
-                                     real* output,
-                                     int* index,
-                                     int numSequences,
-                                     int dim) {
-  int dimIdx = threadIdx.x;
-  int sequenceId = blockIdx.x;
-  if (sequenceId >= numSequences) return;
-  int start = sequence[sequenceId];
-  int end = sequence[sequenceId + 1];
-
-  for (int i = dimIdx; i < dim; i += blockDim.x) {
-    real tmp = -HL_FLOAT_MAX;
-    int tmpId = -1;
-    for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId * dim + i]) {
-        tmp = input[insId * dim + i];
-        tmpId = insId;
-      }
-    }
-    output[sequenceId * dim + i] = tmp;
-    index[sequenceId * dim + i] = tmpId;
-  }
-}
-
-void hl_max_sequence_forward(real* input,
-                             const int* sequence,
-                             real* output,
-                             int* index,
-                             int numSequences,
-                             int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(index);
-
-  dim3 threads(256, 1);
-  dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, sequence, output, index, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_forward failed");
-}
-
-__global__ void KeMaxSequenceBackward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int colIdx = idx % dim;
-  if (idx < numSequences * dim) {
-    int insId = index[idx];
-    inputGrad[insId * dim + colIdx] += outputGrad[idx];
-  }
-}
-
-void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(index);
-  CHECK_NOTNULL(inputGrad);
-
-  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
-  dim3 threads(128, 1);
-  dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      outputGrad, index, inputGrad, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_backward failed");
-}
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                real* table,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int sampleId = blockIdx.x + idy * gridDimX;
-
-  while (sampleId < numSamples) {
-    int tableId = ids[sampleId];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* outputData = output + sampleId * dim;
-      real* tableData = table + tableId * dim;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow == 0) {
-          outputData[i] += tableData[i];
-        } else {
-          paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
-        }
-      }
-    }
-    sampleId += blockDimY * gridDimX;
-  }
-}
-
-template <int blockDimX,
-          int blockDimY,
-          int gridDimX,
-          bool seq2batch,
-          bool isAdd>
-__global__ void KeSequence2Batch(real* batch,
-                                 real* sequence,
-                                 const int* batchIndex,
-                                 int seqWidth,
-                                 int batchCount) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int id = blockIdx.x + idy * gridDimX;
-  while (id < batchCount) {
-    int seqId = batchIndex[id];
-    real* batchData = batch + id * seqWidth;
-    real* seqData = sequence + seqId * seqWidth;
-    for (int i = idx; i < seqWidth; i += blockDimX) {
-      if (seq2batch) {
-        if (isAdd) {
-          batchData[i] += seqData[i];
-        } else {
-          batchData[i] = seqData[i];
-        }
-      } else {
-        if (isAdd) {
-          seqData[i] += batchData[i];
-        } else {
-          seqData[i] = batchData[i];
-        }
-      }
-    }
-    id += blockDimY * gridDimX;
-  }
-}
-
-void hl_sequence2batch_copy(real* batch,
-                            real* sequence,
-                            const int* batchIndex,
-                            int seqWidth,
-                            int batchCount,
-                            bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_copy failed");
-}
-
-void hl_sequence2batch_add(real* batch,
-                           real* sequence,
-                           int* batchIndex,
-                           int seqWidth,
-                           int batchCount,
-                           bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_add failed");
-}
-
-template <bool normByTimes, bool seq2batch>
-__global__ void KeSequence2BatchPadding(real* batch,
-                                        real* sequence,
-                                        const int* sequenceStartPositions,
-                                        const size_t sequenceWidth,
-                                        const size_t maxSequenceLength,
-                                        const size_t numSequences) {
-  int batchIdx = blockIdx.y;
-  int sequenceStart = sequenceStartPositions[batchIdx];
-  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
-
-  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
-  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
-  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
-
-  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
-
-  if (sequenceIdx < sequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
-      }
-    } else {
-      /* batch -> sequence */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
-      }
-    }
-  } else if (sequenceIdx < maxSequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = 0;
-      }
-    }
-  }
-}
-
-void hl_sequence2batch_copy_padding(real* batch,
-                                    real* sequence,
-                                    const int* sequenceStartPositions,
-                                    const size_t sequenceWidth,
-                                    const size_t maxSequenceLength,
-                                    const size_t numSequences,
-                                    bool normByTimes,
-                                    bool seq2batch) {
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(sequenceStartPositions);
-
-  if (!normByTimes && numSequences == 1) {
-    size_t elementCount = maxSequenceLength * sequenceWidth;
-    if (seq2batch) {
-      /* sequence -> batch */
-      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
-    } else {
-      /* batch -> sequence */
-      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
-    }
-    return;
-  }
-
-  const int CUDA_BLOCK_SIZE = 512;
-
-  /* At least use 32 threads to copy sequenceWidth elements,
-     and at least 8 elements for each thread. */
-  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
-  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
-
-  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
-  dim3 threads(blockDimX, blockDimY);
-
-  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
-  int gridDimY = numSequences;
-  dim3 grid(gridDimX, gridDimY);
-
-  if (seq2batch) {
-    /* sequence -> batch */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  } else {
-    /* batch -> sequence */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  }
-
-  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
-}
-
-__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
-
-__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
-
-__global__ void KeSequenceAvgForward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real sum = 0.0;
-    for (int i = start; i < end; i++) {
-      sum += src[i * width + col];
-    }
-    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
-                                       : sum * my_rsqrt((real)seqLength));
-    dst[gid] += sum;
-  }
-}
-
-void hl_sequence_avg_forward(real* dst,
-                             real* src,
-                             const int* starts,
-                             int height,
-                             int width,
-                             const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_forward!";
-
-  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_forward failed");
-}
-
-__global__ void KeSequenceAvgBackward(real* dst,
-                                      real* src,
-                                      const int* starts,
-                                      int height,
-                                      int width,
-                                      const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real grad = src[gid];
-    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
-                                         : grad * my_rsqrt((real)seqLength));
-    for (int i = start; i < end; i++) {
-      dst[i * width + col] += grad;
-    }
-  }
-}
-
-void hl_sequence_avg_backward(real* dst,
-                              real* src,
-                              const int* starts,
-                              int height,
-                              int width,
-                              const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_backward!";
-
-  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_backward failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cu b/paddle/legacy/cuda/src/hl_cuda_sparse.cu
deleted file mode 100644
index 8065a6f9f6f..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sparse.cu
+++ /dev/null
@@ -1,1262 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda.h"
-#include "hl_cuda_sparse.cuh"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sparse.h"
-#include "hl_sparse.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-
-void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
-        A_d2->csr_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csr2dense failed");
-}
-
-void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
-        A_d2->csc_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csc2dense failed");
-}
-
-void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                             hl_matrix_format_t format,
-                             hl_matrix_value_t value_type,
-                             int dimM,
-                             int dimN,
-                             int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-  CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-  /* avoid malloc 0 bytes */
-  int nnz_s = (nnz == 0 ? 1 : nnz);
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->sparsity = -1.0;
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    }
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->sparsity = -1.0f;
-
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    }
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (A_d->matrix == NULL) {
-    free(A_d);
-    return;
-  }
-
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix;
-    if (csr->csr_val != NULL) {
-      hl_free_mem_device(csr->csr_val);
-      csr->csr_val = NULL;
-    }
-
-    if (csr->csr_row != NULL) {
-      hl_free_mem_device(csr->csr_row);
-      csr->csr_row = NULL;
-    }
-
-    if (csr->csr_col != NULL) {
-      hl_free_mem_device(csr->csr_col);
-      csr->csr_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  } else if (A_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix;
-    if (csc->csc_val != NULL) {
-      hl_free_mem_device(csc->csc_val);
-      csc->csc_val = NULL;
-    }
-
-    if (csc->csc_row != NULL) {
-      hl_free_mem_device(csc->csc_row);
-      csc->csc_row = NULL;
-    }
-
-    if (csc->csc_col != NULL) {
-      hl_free_mem_device(csc->csc_col);
-      csc->csc_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  }
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                void *dest_d,
-                                size_t size,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->csr_row = (int *)dest_d;
-      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
-    } else {
-      csr->csr_val = (real *)dest_d;
-      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
-      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimM + 1) * sizeof(int));
-    }
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->csc_col = (int *)dest_d;
-      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
-    } else {
-      csc->csc_val = (real *)dest_d;
-      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
-      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimN + 1) * sizeof(int));
-    }
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                real *value_d,
-                                int *rows_d,
-                                int *cols_d,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->csr_row = rows_d;
-    csr->csr_col = cols_d;
-    csr->csr_val = value_d;
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->csc_row = rows_d;
-    csc->csc_col = cols_d;
-    csc->csc_val = value_d;
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  free(A_d);
-}
-
-void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                          real *csr_val,
-                          int *csr_row,
-                          int *csr_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format!";
-  CHECK_NOTNULL(csr_matrix->matrix);
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csr->nnz_s;
-
-  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
-      << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size "
-      << csr->row_s;
-
-  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csr_matrix->type == HL_NO_VALUE) {
-    if (csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  } else if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-    } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  }
-
-  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
-                  ((float)csr_matrix->cols);
-}
-
-void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                          real *csc_val,
-                          int *csc_row,
-                          int *csc_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csc->nnz_s;
-
-  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
-      << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size "
-      << csc->col_s;
-
-  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csc_matrix->type == HL_NO_VALUE) {
-    if (csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  } else if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-    } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  }
-
-  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
-                  ((float)csc_matrix->cols);
-}
-
-void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                             hl_sparse_matrix_s src,
-                             hl_stream_t stream) {
-  CHECK(dst && src && dst->matrix && src->matrix)
-      << "parameter dst or src is null pointer!";
-  CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
-  CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
-      << "src sparse matrix is no value, dst sparse matrix has value!";
-
-  if (dst->format == HL_SPARSE_CSR) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-    hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
-  } else if (dst->format == HL_SPARSE_CSC) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
-    hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
-  } else {
-    LOG(FATAL) << "sparse matrix format error!";
-  }
-}
-
-/**
- * Calculate beta * C, if beta is zero, C does not have to be a valid input.
- */
-static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
-  if (beta == 0.0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
-  } else {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
-    }
-  }
-
-  return;
-}
-
-void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0);
-  CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
-}
-
-void hl_matrix_dense_mul_csc(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
-  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
-}
-
-void hl_matrix_dense_mul_csr(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
-}
-
-void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
-}
-
-void hl_sparse_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          hl_sparse_matrix_s C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!";
-
-  if (C_d->nnz == 0) return;
-
-  if (C_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
-    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
-        C_d2->csc_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    int blocksX = dimN;
-    int blocksY = 1;
-    dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1);
-    dim3 grid(blocksX, blocksY);
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
-        C_d2->csc_val,
-        C_d2->csc_row,
-        C_d2->csc_col,
-        A_d,
-        B_d,
-        transA,
-        transB,
-        dimM,
-        dimN,
-        dimK,
-        alpha,
-        beta);
-    CHECK_SYNC("hl_sparse_matrix_mul failed");
-  } else {
-    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
-    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
-        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    if (!transB) {
-      int blocksX = dimM;
-      int blocksY = 1;
-      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
-      dim3 grid(blocksX, blocksY);
-
-      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    } else {
-      CHECK(!transA) << "Not supported A is trans and B is not trans!";
-
-      dim3 block(CU_BLOCK_SIZE, 1);
-      int avgNnzPerRow = C_d->nnz / dimM;
-      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
-      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
-      dim3 grid(gridx, dimM);
-      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    }
-  }
-}
-
-void hl_memcpy_from_csc_matrix(real *csc_val,
-                               size_t val_size,
-                               int *csc_row,
-                               size_t row_size,
-                               int *csc_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csc_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_NOTNULL(csc_row);
-  CHECK_NOTNULL(csc_col);
-
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  if (csc_matrix->nnz > row_size ||
-      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  hl_memcpy_async((void *)csc_row,
-                  (void *)csc->csc_row,
-                  (csc_matrix->nnz) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csc_col,
-                  (void *)csc->csc_col,
-                  (csc_matrix->cols + 1) * sizeof(int),
-                  stream);
-  if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val != NULL) {
-      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csc_val,
-                      (void *)csc->csc_val,
-                      (csc_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_memcpy_from_csr_matrix(real *csr_val,
-                               size_t val_size,
-                               int *csr_row,
-                               size_t row_size,
-                               int *csr_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csr_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_NOTNULL(csr_row);
-  CHECK_NOTNULL(csr_col);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format error!";
-
-  if (csr_matrix->nnz > col_size ||
-      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  hl_memcpy_async((void *)csr_row,
-                  (void *)csr->csr_row,
-                  (csr_matrix->rows + 1) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csr_col,
-                  (void *)csr->csr_col,
-                  (csr_matrix->nnz) * sizeof(int),
-                  stream);
-  if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val != NULL) {
-      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csr_val,
-                      (void *)csr->csr_val,
-                      (csr_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  if (B_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (B_d->nnz == 0) return;
-
-  int nnz = B_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d, B_d2->csr_val, B_d2->csr_col, nnz);
-
-  CHECK_SYNC("hl_matrix_csr_column_sum failed");
-}
-
-void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_bias(A_d, B_d, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int nnz = A_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz);
-
-  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
-}
-
-void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                real *B_d,
-                                int dimM,
-                                int dimN,
-                                real alpha,
-                                real beta) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                             real *B_d,
-                             int dimM,
-                             int dimN,
-                             real alpha,
-                             real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) {
-    LOG(FATAL) << "parameter dim error!";
-  }
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int gridX = DIVUP((A_d->nnz / dimM), 512);
-  gridX = gridX > 0 ? gridX : 1;
-  dim3 block(512, 1);
-  dim3 grid(gridX, dimM);
-  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
-                                                           A_d2->csr_row,
-                                                           A_d2->csr_col,
-                                                           B_d,
-                                                           alpha,
-                                                           beta,
-                                                           dimM,
-                                                           dimN);
-
-  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
-}
-
-int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, row);
-}
-
-int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, col);
-}
-
-real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, val);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cuh b/paddle/legacy/cuda/src/hl_cuda_sparse.cuh
deleted file mode 100644
index adb898c9ac6..00000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sparse.cuh
+++ /dev/null
@@ -1,1015 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#include "hl_device_functions.cuh"
-
-template <int VALUE_TYPE>
-__device__ real findvalue(real* csr_val,
-                          int* csr_col,
-                          int col_start,
-                          int col_end,
-                          int index) {
-  int start = col_start;
-  int end = col_end-1;
-  int mid = -1;
-
-  while (start < end) {
-    mid = start + ((end - start) / 2);
-    if (csr_col[mid] < index)
-      start = mid + 1;
-    else
-      end = mid;
-  }
-
-  if ((start < col_end) && (csr_col[start] == index)) {
-    real ret = VALUE_TYPE == 0 ? 1.0 : csr_val[start];
-    return ret;
-  } else {
-    return 0.0;
-  }
-}
-
-#define     CU_CSR2DENSE_THREAD_X   16
-#define     CU_CSR2DENSE_THREAD_Y   16
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsr2Dense(real * csr_val,
-                                   int * csr_row,
-                                   int * csr_col,
-                                   real * C_d,
-                                   const int dimM,
-                                   const int dimN) {
-  const int row = blockIdx.y*blockDim.y+threadIdx.y;
-  const int col = blockIdx.x*blockDim.x+threadIdx.x;
-
-  if (row >= dimM || col >= dimN) {
-    return;
-  }
-
-  int start = csr_row[row];
-  int end = csr_row[row+1];
-
-  real sum = findvalue<VALUE_TYPE>(csr_val, csr_col, start, end, col);
-  C_d[row*dimN + col] = sum;
-}
-
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsc2Dense(real * csc_val,
-                                   int * csc_row,
-                                   int * csc_col,
-                                   real * C_d,
-                                   const int dimM,
-                                   const int dimN) {
-  const int row = blockIdx.y*blockDim.y+threadIdx.y;
-  const int col = blockIdx.x*blockDim.x+threadIdx.x;
-
-  if (row >= dimM || col >= dimN) {
-    return;
-  }
-
-  int start = csc_col[col];
-  int end = csc_col[col+1];
-
-  real sum = findvalue<VALUE_TYPE>(csc_val, csc_row, start, end, row);
-  C_d[row*dimN + col] = sum;
-}
-
-__device__ __forceinline__
-void _calculate_c(real &c, real sum) {
-  c = sum;
-}
-__device__ __forceinline__
-void _calculate_c(real &c, real sum, real beta) {
-  c = sum + beta * c;
-}
-
-#define     CU_CSRMM_N                  4
-#define     CU_CSRMM_THREAD_X           32
-#define     CU_CSRMM_THREAD_Y           32
-#define     CU_CSRMM_BLOCK_N            (32*CU_CSRMM_N)
-#define     CU_CSRMM_SHARED_ELEMENT     (2*CU_CSRMM_THREAD_X)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsrMulDense(real *C_d,
-                                     real * csr_val,
-                                     int * csr_col,
-                                     int * csr_row,
-                                     real *B_d,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_m = blockIdx.y*CU_CSRMM_THREAD_Y+threadIdx.y;
-  int index_n = blockIdx.x*CU_CSRMM_BLOCK_N+threadIdx.x;
-
-  __shared__ real csr_val_sh[CU_CSRMM_THREAD_Y][CU_CSRMM_SHARED_ELEMENT];
-  __shared__ int csr_col_sh[CU_CSRMM_THREAD_Y][CU_CSRMM_SHARED_ELEMENT];
-
-  if (index_m >= dimM) {
-    return;
-  }
-
-  // possible optimization, cache this in shared memory
-  int csr_start = csr_row[index_m];
-  int csr_end = csr_row[index_m+1];
-  int csr_index =  csr_start + idx;
-
-  int csr_iter = (csr_end-csr_start)/CU_CSRMM_SHARED_ELEMENT;
-  int csr_rem = (csr_end-csr_start)%CU_CSRMM_SHARED_ELEMENT;
-
-  int index_k = -1;
-  real sum[CU_CSRMM_N] = {0};
-  real b_r[CU_CSRMM_N] = {0};
-
-  for (int csr_i = 0; csr_i < csr_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0; i < (CU_CSRMM_SHARED_ELEMENT/CU_CSRMM_THREAD_X); i++) {
-      if (VALUE_TYPE != 0) {
-        csr_val_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_val[csr_index];
-      }
-      csr_col_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_col[csr_index];
-      csr_index += CU_CSRMM_THREAD_X;
-    }
-
-    for (int index = 0; index < CU_CSRMM_SHARED_ELEMENT; index++) {
-      index_k = csr_col_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idy][index];
-      int tmp_index = index_n;
-      real *B_d_r = B_d + tmp_index;
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        if (tmp_index >= dimN) break;
-        b_r[n] = B_d_r[index_k*dimN];
-        B_d_r += CU_CSRMM_THREAD_X;
-        tmp_index += CU_CSRMM_THREAD_X;
-      }
-
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        sum[n] = VALUE_TYPE == 0 ? sum[n] + b_r[n] : sum[n] + a_r*b_r[n];
-      }
-    }
-    // __syncthreads();
-  }
-
-  if (csr_rem != 0) {
-    #pragma unroll
-    for (int i = 0; i < (CU_CSRMM_SHARED_ELEMENT/CU_CSRMM_THREAD_X); i++) {
-      if (csr_index < csr_end) {
-        if (VALUE_TYPE != 0) {
-            csr_val_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_val[csr_index];
-        }
-        csr_col_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_col[csr_index];
-      }
-      csr_index += CU_CSRMM_THREAD_X;
-    }
-    // __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csr_rem; index++) {
-      index_k = csr_col_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idy][index];
-      int tmp_index = index_n;
-      real *B_d_r = B_d + tmp_index;
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        if (tmp_index >= dimN) break;
-        b_r[n] = B_d_r[index_k*dimN];
-        B_d_r += CU_CSRMM_THREAD_X;
-        tmp_index += CU_CSRMM_THREAD_X;
-      }
-
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        sum[n] = VALUE_TYPE == 0 ? sum[n] + b_r[n] : sum[n] + a_r*b_r[n];
-      }
-    }
-  }
-
-  C_d += __mul24(index_m, dimN);
-  if (beta == 0.0) {
-    for (int n = 0; n < CU_CSRMM_N; n++) {
-      if (index_n < dimN) {
-        _calculate_c(C_d[index_n], alpha * sum[n]);
-        index_n += CU_CSRMM_THREAD_X;
-      }
-    }
-  } else {
-    for (int n = 0; n < CU_CSRMM_N; n++) {
-      if (index_n < dimN) {
-        _calculate_c(C_d[index_n], alpha * sum[n], beta);
-        index_n += CU_CSRMM_THREAD_X;
-      }
-    }
-  }
-}
-
-#define CU_CSC_MUL_DENSE_THREAD_N           1
-#define CU_CSC_MUL_DENSE_THREAD_X           32
-#define CU_CSC_MUL_DENSE_THREAD_Y           4
-#define CU_CSC_MUL_DENSE_BLOCK_K            (CU_CSC_MUL_DENSE_THREAD_Y)
-#define CU_CSC_MUL_DENSE_BLOCK_N            \
-        (CU_CSC_MUL_DENSE_THREAD_N * CU_CSC_MUL_DENSE_THREAD_X)
-#define CU_CSC_MUL_DENSE_SHARED_ELEMENT     (CU_CSC_MUL_DENSE_THREAD_X)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCscMulDense(real *C_d,
-                                     real * csc_val,
-                                     int * csc_row,
-                                     int * csc_col,
-                                     real *B_d,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_k = blockIdx.y*CU_CSC_MUL_DENSE_BLOCK_K+threadIdx.y;
-  const int index_n = blockIdx.x*CU_CSC_MUL_DENSE_BLOCK_N+threadIdx.x;
-
-  if (index_k >= dimK) {
-    return;
-  }
-
-  __shared__
-  real csc_val_sh[CU_CSC_MUL_DENSE_THREAD_Y][CU_CSC_MUL_DENSE_SHARED_ELEMENT];
-  __shared__
-  int csc_row_sh[CU_CSC_MUL_DENSE_THREAD_Y][CU_CSC_MUL_DENSE_SHARED_ELEMENT];
-
-  // possible optimization, cache this in shared memory
-  int csc_start = csc_col[index_k];
-  int csc_end = csc_col[index_k+1];
-  int csc_index = csc_start + idx;
-  int csc_iter = (csc_end-csc_start)/CU_CSC_MUL_DENSE_SHARED_ELEMENT;
-  int csc_rem = (csc_end-csc_start)%CU_CSC_MUL_DENSE_SHARED_ELEMENT;
-  int index_m = -1;
-
-  real b_r[CU_CSC_MUL_DENSE_THREAD_N] = {0};
-  real *B_d_r;
-  real *C_d_r;
-  int index_n_t;
-  B_d += index_n + __mul24(index_k, dimN);
-  C_d += index_n;
-  for (int csr_i = 0; csr_i < csc_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0;
-         i < (CU_CSC_MUL_DENSE_SHARED_ELEMENT/CU_CSC_MUL_DENSE_THREAD_X); i++) {
-      if (VALUE_TYPE != 0) {
-        csc_val_sh[idy][idx + i*CU_CSC_MUL_DENSE_THREAD_X] = csc_val[csc_index];
-      }
-      csc_row_sh[idy][idx + i*CU_CSC_MUL_DENSE_THREAD_X] = csc_row[csc_index];
-      csc_index += CU_CSC_MUL_DENSE_THREAD_X;
-    }
-
-    #pragma unroll
-    for (int index = 0; index < CU_CSC_MUL_DENSE_SHARED_ELEMENT; index++) {
-      index_m = csc_row_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      B_d_r = B_d;
-      C_d_r = C_d + __mul24(index_m, dimN);
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          b_r[n] = B_d_r[0];
-          B_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          real tmp;
-          tmp = alpha*a_r*b_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-    }
-    // __syncthreads();
-  }
-
-  if (csc_rem != 0) {
-    #pragma unroll
-    for (int i = 0;
-         i < (CU_CSC_MUL_DENSE_SHARED_ELEMENT/CU_CSC_MUL_DENSE_THREAD_X); i++) {
-      if (csc_index < csc_end) {
-        if (VALUE_TYPE != 0) {
-          csc_val_sh[idy][idx + i * CU_CSC_MUL_DENSE_THREAD_X] =
-            csc_val[csc_index];
-        }
-        csc_row_sh[idy][idx + i * CU_CSC_MUL_DENSE_THREAD_X] =
-          csc_row[csc_index];
-      }
-      csc_index += CU_CSC_MUL_DENSE_THREAD_X;
-    }
-    // __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csc_rem; index++) {
-      index_m = csc_row_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      B_d_r = B_d;
-      C_d_r = C_d + __mul24(index_m, dimN);
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          b_r[n] = B_d_r[0];
-          B_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          real tmp;
-          tmp = alpha*a_r*b_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-    }
-  }
-}
-
-/* best perf */
-#ifndef PADDLE_TYPE_DOUBLE
-#define CU_CSCMM_THREAD_M_BEST          9
-#else
-#define CU_CSCMM_THREAD_M_BEST          4
-#endif
-#define CU_CSCMM_THREAD_X_BEST          32
-#define CU_CSCMM_THREAD_Y_BEST          32
-#define CU_CSCMM_BLOCK_M_BEST  (CU_CSCMM_THREAD_M_BEST * CU_CSCMM_THREAD_X_BEST)
-#define CU_CSCMM_BLOCK_N_BEST  (CU_CSCMM_THREAD_Y_BEST)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixDenseMulCsc(real *C_d,
-                                     const real *A_d,
-                                     const real *csc_val,
-                                     const int *csc_row,
-                                     const int *csc_col,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  __shared__ real csc_val_sh[CU_CSCMM_BLOCK_N_BEST][CU_CSCMM_THREAD_X_BEST];
-  __shared__ int csc_row_sh[CU_CSCMM_BLOCK_N_BEST][CU_CSCMM_THREAD_X_BEST];
-  __shared__ real A_s[CU_CSCMM_BLOCK_M_BEST][CU_CSCMM_THREAD_Y_BEST+1];
-
-  int iter_k = dimK/CU_CSCMM_THREAD_Y_BEST;
-  int rem_k = dimK%CU_CSCMM_THREAD_Y_BEST;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_n = blockIdx.y*CU_CSCMM_BLOCK_N_BEST+threadIdx.y;
-
-  int csc_start;
-  int csc_end;
-  if (index_n < dimN) {
-    csc_start = csc_col[index_n];
-    csc_end = csc_col[index_n+1];
-  } else {
-    csc_start = 0;
-    csc_end = 0;
-  }
-  int csc_index =  csc_start + idx;
-  int csc_iter = (csc_end-csc_start)/CU_CSCMM_THREAD_X_BEST;
-  int csc_rem = (csc_end-csc_start)%CU_CSCMM_THREAD_X_BEST;
-  int index_k = -1;
-
-  if (csc_index < csc_end) {
-    if (VALUE_TYPE != 0) {
-      csc_val_sh[idy][idx] = csc_val[csc_index];
-    }
-    csc_row_sh[idy][idx] = csc_row[csc_index];
-    csc_index += CU_CSCMM_THREAD_X_BEST;
-  }
-
-  const int ibx = blockIdx.x * CU_CSCMM_BLOCK_M_BEST;
-  int dim = ibx+idy;
-  A_d += idx + __mul24(dim, dimK);
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    A_s[idy + m * 32][idx] = 0.0f;
-    if (dim + m * 32 < dimM && idx < dimK) {
-      A_s[idy + m * 32][idx] = A_d[m * 32 * dimK];
-    }
-  }
-  __syncthreads();
-
-  real b_r;
-  real a_r[CU_CSCMM_THREAD_M_BEST] = {0};
-  real sum[CU_CSCMM_THREAD_M_BEST] = {0};
-  real A_r_s[CU_CSCMM_THREAD_M_BEST] = {0};
-  int index = 0;
-  int block_end_k = 0;;
-  int index_iter_csc = csc_iter;
-
-  for (int i_k = 0; i_k < iter_k; i_k++) {
-    A_d += CU_CSCMM_THREAD_Y_BEST;
-    block_end_k += CU_CSCMM_THREAD_Y_BEST;
-    #pragma unroll
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (dim + m*32 < dimM && (idx + (i_k+1)*CU_CSCMM_THREAD_Y_BEST < dimK)) {
-        A_r_s[m] = A_d[m*32*dimK];
-      } else {
-        A_r_s[m] = 0.0f;
-      }
-    }
-
-    if (index_iter_csc > 0) {
-      goto WARP_SYNC;
-    } else {
-      goto WARP_SYNC_2;
-    }
-
-    while (index_iter_csc) {
-      if (VALUE_TYPE != 0) {
-        csc_val_sh[idy][idx] = csc_val[csc_index];
-      }
-      csc_row_sh[idy][idx] = csc_row[csc_index];
-      csc_index += CU_CSCMM_THREAD_X_BEST;
-      index = 0;
-
-WARP_SYNC:
-      for (; index < CU_CSCMM_THREAD_X_BEST; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= block_end_k) {
-          goto BLOCK_SYNC;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-i_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-      index_iter_csc--;
-    }
-
-    if (csc_rem != 0) {
-      if (csc_iter != 0) {
-        if (csc_index < csc_end) {
-          if (VALUE_TYPE != 0) {
-            csc_val_sh[idy][idx] = csc_val[csc_index];
-          }
-          csc_row_sh[idy][idx] = csc_row[csc_index];
-          csc_index += CU_CSCMM_THREAD_X_BEST;
-        }
-        index = 0;
-      }
-      __threadfence_block();
-
-WARP_SYNC_2:
-      for (; index < csc_rem; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= block_end_k) {
-          goto BLOCK_SYNC;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-i_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-    }
-
-BLOCK_SYNC:
-    __syncthreads();
-    #pragma unroll
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      A_s[idy+m*32][idx] = A_r_s[m];
-    }
-    __syncthreads();
-  }
-
-  if (rem_k != 0) {
-    if (index_iter_csc == 0) {
-      goto TEMP_TEST;
-    }
-
-    for (; index < CU_CSCMM_THREAD_X_BEST; index++) {
-      index_k = csc_row_sh[idy][index];
-      if (index_k >= dimK) {
-        break;
-      }
-
-      b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      #pragma unroll
-      for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-        a_r[m] = A_s[idx+m*32][index_k-iter_k*CU_CSCMM_THREAD_Y_BEST];
-        sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-      }
-    }
-
-    if (csc_rem != 0) {
-      if (csc_index < csc_end) {
-        if (VALUE_TYPE != 0) {
-          csc_val_sh[idy][idx] = csc_val[csc_index];
-        }
-        csc_row_sh[idy][idx] = csc_row[csc_index];
-        csc_index += CU_CSCMM_THREAD_X_BEST;
-      }
-      index = 0;
-
-TEMP_TEST:
-      for (; index < csc_rem; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= dimK) {
-            break;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-iter_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-    }
-  }
-
-  __syncthreads();
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    A_s[idx+m*32][idy] = alpha*sum[m];
-  }
-  __syncthreads();
-
-  int index_m_c = ibx + idy;
-  int index_n_c = blockIdx.y*CU_CSCMM_BLOCK_N_BEST + idx;
-  C_d += index_n_c + __mul24(index_m_c, dimN);
-  if (beta == 0.0) {
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (index_m_c < dimM && index_n_c < dimN) {
-        _calculate_c(C_d[0], A_s[idy + m * 32][idx]);
-      }
-      index_m_c += 32;
-      C_d += dimN*32;
-    }
-  } else {
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (index_m_c < dimM && index_n_c < dimN) {
-        _calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
-      }
-      index_m_c += 32;
-      C_d += dimN*32;
-    }
-  }
-}
-
-#define     CU_DM_CSR_THREAD_X           32
-#define     CU_DM_CSR_THREAD_Y           4
-#define     CU_DM_CSR_N                  4
-#define     CU_DM_CSR_BLOCK_M            (CU_DM_CSR_N*CU_DM_CSR_THREAD_Y)
-#define     CU_DM_CSR_BLOCK_K            (CU_DM_CSR_THREAD_X)
-#define     CU_DM_CSR_SHARED_ELEMENT     (1*CU_DM_CSR_THREAD_Y)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixDenseMulCsr(real *C_d,
-                                     real *A_d,
-                                     real *csr_val,
-                                     const int *csr_row,
-                                     const int *csr_col,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int index_k = __mul24(blockIdx.x, CU_DM_CSR_THREAD_X) + threadIdx.x;
-  int index_m = __mul24(blockIdx.y, CU_DM_CSR_BLOCK_M) +
-    __mul24(threadIdx.y, CU_DM_CSR_N);
-
-  if (index_k >= dimK) {
-    return;
-  }
-
-  __shared__ real csr_val_sh[CU_DM_CSR_THREAD_X][CU_DM_CSR_SHARED_ELEMENT];
-  __shared__ int csr_col_sh[CU_DM_CSR_THREAD_X][CU_DM_CSR_SHARED_ELEMENT];
-
-  // possible optimization, cache this in shared memory
-  int csr_start = csr_row[index_k];
-  int csr_end = csr_row[index_k+1];
-  int csr_index =  csr_start + idy;
-  int csr_iter = (csr_end-csr_start)/CU_DM_CSR_SHARED_ELEMENT;
-  int csr_rem = (csr_end-csr_start)%CU_DM_CSR_SHARED_ELEMENT;
-
-  real tmp = 0.0;
-  int index_n = -1;
-  int index_m_t = index_m;
-  real a_r[CU_DM_CSR_N] = {0};
-  real *A_d_tmp = A_d + __mul24(index_m, dimK) + index_k;
-  real *A_d_r = A_d_tmp;
-
-  #pragma unroll
-  for (int n=0; n < CU_DM_CSR_N; n++) {
-    if ( index_m_t++ < dimM ) {
-      a_r[n] = A_d_r[0];
-      A_d_r += dimK;
-    }
-  }
-
-  for (int csr_i = 0; csr_i < csr_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0; i < (CU_DM_CSR_SHARED_ELEMENT/CU_DM_CSR_THREAD_Y); i++) {
-      if (VALUE_TYPE != 0) {
-        csr_val_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_val
-        [csr_index];
-      }
-      csr_col_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_col[csr_index];
-      csr_index += CU_DM_CSR_THREAD_Y;
-    }
-    __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < CU_DM_CSR_SHARED_ELEMENT; index++) {
-      index_n = csr_col_sh[idx][index];
-      real b_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idx][index];
-      real *C_d_r = C_d + __mul24(index_m, dimN) + index_n;
-
-      index_m_t = index_m;
-      #pragma unroll
-      for (int n=0; n < CU_DM_CSR_N; n++) {
-        if (index_m_t++ < dimM) {
-          tmp = alpha * b_r * a_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += dimN;
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  if (csr_rem != 0) {
-    #pragma unroll
-    for (int i = 0; i < (CU_DM_CSR_SHARED_ELEMENT/CU_DM_CSR_THREAD_Y); i++) {
-      if (csr_index < csr_end) {
-        if (VALUE_TYPE !=0) {
-          csr_val_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_val[csr_index];
-        }
-        csr_col_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_col[csr_index];
-      }
-      csr_index += CU_DM_CSR_THREAD_Y;
-    }
-    __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csr_rem; index++) {
-      index_n = csr_col_sh[idx][index];
-      real b_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idx][index];
-      real *C_d_r = C_d + __mul24(index_m, dimN) + index_n;
-      index_m_t = index_m;
-      #pragma unroll
-      for (int n=0; n < CU_DM_CSR_N; n++) {
-        if (index_m_t++ < dimM) {
-          tmp = alpha * b_r * a_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += dimN;
-        }
-      }
-    }
-  }
-}
-
-#define     CU_CSCMM_DMD2CSC_THREAD_X   128
-#define     CU_CSCMM_DMD2CSC_SHARE_X    128
-__global__ void KeSMatrixDenseMulDense2CSC(real *csc_val,
-                                           const int *csc_row,
-                                           const int *csc_col,
-                                           real *A_d,
-                                           real *B_d,
-                                           bool trans_A,
-                                           bool trans_B,
-                                           int dimM,
-                                           int dimN,
-                                           int dimK,
-                                           real alpha,
-                                           real beta) {
-  __shared__ real B_s[CU_CSCMM_DMD2CSC_SHARE_X];
-  const int idx = threadIdx.x;  // one block compute one column
-  const int ibx = blockIdx.x;  // col index
-  int csc_start;
-  int csc_end;
-  if (ibx < dimN) {
-    csc_start = csc_col[ibx];
-    csc_end = csc_col[ibx + 1];
-  } else {
-    csc_start = 0;
-    csc_end = 0;
-  }
-
-  int iter_num = dimK / CU_CSCMM_DMD2CSC_SHARE_X;
-  int iter_rem = dimK % CU_CSCMM_DMD2CSC_SHARE_X;
-  real * B_tmp = B_d + ibx;  // column index
-
-  for (int j = 0; j < iter_num; j++) {
-    int rowStart = (j * CU_CSCMM_DMD2CSC_SHARE_X + idx) * dimN;
-    int index = rowStart;
-    for (int m = idx;
-         m < CU_CSCMM_DMD2CSC_SHARE_X; m += CU_CSCMM_DMD2CSC_THREAD_X) {
-     B_s[m] = B_tmp[index];
-     index = index + CU_CSCMM_DMD2CSC_THREAD_X * dimN;
-    }
-    __syncthreads();
-
-    for (int i = csc_col[ibx] + idx;
-         i < csc_col[ibx + 1]; i += CU_CSCMM_DMD2CSC_THREAD_X) {
-      int row = csc_row[i];  // row Index
-      /* compute C[row, ibx] */
-      float results = 0;
-      if (!trans_A) {
-        int index = row * dimK + j * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < CU_CSCMM_DMD2CSC_SHARE_X; k++) {
-          results += A_d[index + k] * B_s[k];
-        }
-      } else {
-        int  index = j * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < CU_CSCMM_DMD2CSC_SHARE_X; k++) {
-          results += A_d[(index + k) * dimM + row] * B_s[k];
-        }
-      }
-      csc_val[i]  += results * alpha;
-    }
-  }
-
-  if (iter_rem) {
-    int rowStart = (iter_num * CU_CSCMM_DMD2CSC_SHARE_X + idx) * dimN;
-    int index = rowStart;
-    // #pragma unroll
-    for (int m = idx; m < iter_rem;  m += CU_CSCMM_DMD2CSC_THREAD_X) {
-      B_s[m] = B_tmp[index];
-      index = index + CU_CSCMM_DMD2CSC_THREAD_X * dimN;
-    }
-    __syncthreads();
-    for (int i = csc_start + idx;
-         i < csc_end; i += CU_CSCMM_DMD2CSC_THREAD_X) {
-      int row = csc_row[i];  // row Index
-      /* compute C[row, ibx] */
-      float results = 0;
-      if (!trans_A) {
-        int index = row * dimK + iter_num * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < iter_rem; k++) {
-          results += A_d[index + k] * B_s[k];
-        }
-      } else {
-        int  index =  iter_num * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < iter_rem; k++) {
-          results += A_d[(index + k) * dimM + row] * B_s[k];
-        }
-      }
-      csc_val[i] += alpha * results;
-    }
-  }
-}
-
-#define     CU_CSCMM_DMD2CSR_THREAD_X   128
-#define     CU_CSCMM_DMD2CSR_SHARE_X    128
-__global__ void KeSMatrixDenseMulDense2CSR(real *csr_val,
-                                     const int *csr_row,
-                                     const int *csr_col,
-                                     real *A_d,
-                                     real *B_d,
-                                     bool  trans_A,
-                                     bool  trans_B,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  __shared__ real A_s[CU_CSCMM_DMD2CSR_SHARE_X];
-  const int idx = threadIdx.x;  // one block comput one row
-  const int ibx = blockIdx.x;  // row index
-
-  int csr_start;
-  int csr_end;
-  if (ibx < dimM) {
-    csr_start = csr_row[ibx];
-    csr_end = csr_row[ibx+1];
-  } else {
-    csr_start = 0;
-    csr_end = 0;
-  }
-
-  int iter_num = dimK / CU_CSCMM_DMD2CSR_SHARE_X;
-  int csr_rem = dimK % CU_CSCMM_DMD2CSR_SHARE_X;
-  for (int j = 0; j < iter_num; j++) {
-    if (!trans_A) {
-      int colStart = j * CU_CSCMM_DMD2CSR_SHARE_X + ibx * dimK;
-      int index = colStart + idx;
-      #pragma unroll
-      for (int m = idx;
-           m < CU_CSCMM_DMD2CSR_SHARE_X; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X;
-      }
-    } else {
-      int colStart = (j * CU_CSCMM_DMD2CSR_SHARE_X) * dimM  + ibx;
-      int index = colStart + idx * dimM;
-      for (int m = idx;
-           m < CU_CSCMM_DMD2CSR_SHARE_X; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X * dimM;
-      }
-    }
-    __syncthreads();
-    for (int i = csr_start + idx; i < csr_end; i += CU_CSCMM_DMD2CSR_THREAD_X) {
-      int col_idx =  csr_col[i];  // col index
-      /* comput C[ibx, col_idx] */
-      real results = 0;
-      int index = (j * CU_CSCMM_DMD2CSR_SHARE_X) * dimN + col_idx;
-      for (int k = 0; k < CU_CSCMM_DMD2CSR_SHARE_X; k++) {
-        results += A_s[k] * B_d[k * dimN + index];
-      }
-      csr_val[i] += alpha * results;
-    }
-  }
-
-  if (csr_rem) {
-    if (!trans_A) {
-      int colStart = (ibx + 1) * dimK- csr_rem;
-      int index = colStart + idx;
-      #pragma unroll
-      for (int m = idx; m < csr_rem; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X;
-      }
-     } else {
-        int colStart = (iter_num * CU_CSCMM_DMD2CSR_SHARE_X) * dimM  + ibx;
-        int index = colStart + idx * dimM;
-        for (int m = idx; m < csr_rem;  m += CU_CSCMM_DMD2CSR_THREAD_X) {
-          A_s[m] = A_d[index];
-          index = index + CU_CSCMM_DMD2CSR_THREAD_X * dimM;
-        }
-     }
-     __syncthreads();
-     for (int i = csr_start + idx;
-          i < csr_end; i += CU_CSCMM_DMD2CSR_THREAD_X) {
-       int col_idx =  csr_col[i];
-       float results = 0;
-       int  index = (iter_num *CU_CSCMM_DMD2CSR_SHARE_X) * dimN + col_idx;
-       for (int k = 0; k < csr_rem; k++) {
-         results += A_s[k ] * B_d[k * dimN + index];
-       }
-       csr_val[i] += alpha * results;
-     }
-  }
-}
-
-
-/**
- *  @brief  Use to calculate row/col index for CSR/CSC sparse matrix
- *          according to csr_row(csc_col) and
- *          the value position in csr_val/csc_val
- *
- *  @param  indice      csr_row for hl_csr_matrix
- *                      csc_col for hl_csc_matrix
- *  @param  num         length of csr_row/csc_col
- *  @param  index       the value position in csr_val/csc_val
- *                      but need to add 1
- *                      that is, 1,2,3,...,nnz
- *  @note   the following kernels doesn't use findIndex,
- *          but may be used in the future.
- */
-__device__ __forceinline__
-int findIndex(int* indice, int num, int index) {
-  int start = 0;
-  int end = num - 1;
-  int mid = -1;
-  while (start < end) {
-    mid = start + ((end - start) / 2);
-    if (indice[mid] < index)
-      start = mid + 1;
-    else
-      end = mid;
-  }
-  return (end - 1);
-}
-
-
-/**
- * @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
- *        This kernel used atomicAdd and adapted to w >> h, w is the
- *        width of csr, and h is the height of csr.
- */
-__global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
-                                      int* csr_col, const int dimNNZ) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
-    int colIdx = csr_col[idx];
-    real val = csr_val[idx];
-    paddle::paddleAtomicAdd(a_val + colIdx, val);
-  }
-}
-
-__global__ void KeSMatrixCsrAddBias(real* csr_val, int* csr_col, real* b_d,
-                                    real scale, const int nnz) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;  // global index
-  for (int idx = gid; idx < nnz; idx += gridDim.x * blockDim.x) {
-    int colIdx = csr_col[idx];
-    // not coalesced access to b_d
-    csr_val[idx] += scale * b_d[colIdx];
-  }
-}
-
-/**
- * @brief  csr sparse matrix add dense matrix.
- *         This kernel occurs load imbalances
- *         if number of each row is different greatly.
- */
-__global__ void KeSMatrixCsrAddDense(real* csr_val, int* csr_row,
-                                     int* csr_col, real* b_d, real alpha,
-                                     real beta, int dimM, int dimN) {
-  int gidx = blockIdx.x * blockDim.x + threadIdx.x;
-  int gidy = blockIdx.y;
-  if (gidy < dimM) {
-    int start = csr_row[gidy];
-    int end = csr_row[gidy + 1];
-    for (int x = gidx; x < (end - start); x += gridDim.x * blockDim.x) {
-      int col = csr_col[start + x];
-      real val = csr_val[start + x];
-      csr_val[start + x] = beta * val + alpha * b_d[gidy * dimN + col];
-    }
-  }
-}
-
-#define CU_BLOCK_K 16
-#define CU_BLOCK_SIZE 128
-
-__global__ void KeSMatrixDenseMulDenseTrans2CSR(
-    real* csr_val, const int* csr_row, const int* csr_col, real* A_d,
-    real* B_d, bool trans_A, bool trans_B, int dimM, int dimN, int dimK,
-    real alpha, real beta) {
-
-  __shared__ real B_s[CU_BLOCK_SIZE][CU_BLOCK_K];
-  __shared__ real A_s[CU_BLOCK_K];
-
-  const int idx = threadIdx.x;
-
-  const int gidx_begin = blockIdx.x * CU_BLOCK_SIZE;
-  const int gidy = blockIdx.y;
-  const int gx_dim = gridDim.x * blockDim.x;
-
-  int start = csr_row[gidy];
-  int end = csr_row[gidy + 1];
-  int size = end - start;
-
-  int c_iter_num = (size + gx_dim - 1) / gx_dim;
-  int iter_num = (dimK + CU_BLOCK_K - 1) / CU_BLOCK_K;
-  for (int i = 0; i < c_iter_num; ++i) {
-    if ((gidx_begin + i * gx_dim) >= size) {
-      return;  // No need to calculate in this block.
-    }
-
-    real res = 0.0;
-    int c_idx = gidx_begin + i * gx_dim + idx;
-
-    for (int j = 0; j < iter_num; ++j) {
-      int col = j * CU_BLOCK_K + idx;
-      if (idx < CU_BLOCK_K) {
-        A_s[idx] = col < dimK ? A_d[gidy * dimK + col] : 0.0;
-      }
-      for (int m = 0; m < CU_BLOCK_K; ++m) {
-        int row = (idx / CU_BLOCK_K) + m * (CU_BLOCK_SIZE / CU_BLOCK_K);
-        col = idx % CU_BLOCK_K;
-        int csr_idx = gidx_begin + i * gx_dim + row;
-        int ldRow = csr_idx < size ? csr_col[start + csr_idx] : 0;
-        int ldCol = j * CU_BLOCK_K + col;
-        B_s[row][col] = (csr_idx < size && ldCol < dimK) ?
-                        B_d[ldRow * dimK + ldCol] : 0.0;
-      }
-      __syncthreads();
-
-      for (int k = 0; k < CU_BLOCK_K; k++) {
-        res += A_s[k] * B_s[idx][k];
-      }
-      __syncthreads();
-    }
-
-    if (c_idx < size) {
-      csr_val[start + c_idx] += alpha * res;
-    }
-  }
-}
diff --git a/paddle/legacy/cuda/src/hl_math.cc b/paddle/legacy/cuda/src/hl_math.cc
deleted file mode 100644
index 585b356d0a7..00000000000
--- a/paddle/legacy/cuda/src/hl_math.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "avx_mathfun.h"
-
-namespace hppl {
-__m256 exp(__m256 a) { return exp256_ps(a); }
-
-__m256 log(__m256 a) { return log256_ps(a); }
-
-__m256 sin(__m256 a) { return sin256_ps(a); }
-
-__m256 cos(__m256 a) { return cos256_ps(a); }
-
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_perturbation_util.cu b/paddle/legacy/cuda/src/hl_perturbation_util.cu
deleted file mode 100644
index e15cbb14393..00000000000
--- a/paddle/legacy/cuda/src/hl_perturbation_util.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdlib.h>
-#include <cmath>
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_perturbation_util.cuh"
-#include "hl_time.h"
-
-#define _USE_MATH_DEFINES
-
-/*
- * Get the original coordinate for a pixel in a transformed image.
- * x, y: coordiate in the transformed image.
- * tgtCenter: the center coordiate of the transformed image.
- * imgSCenter: the center coordinate of the source image.
- * centerX, centerY: translation.
- * sourceX, sourceY: output coordinates in the original image.
- */
-__device__ void getTranformCoord(int x,
-                                 int y,
-                                 real theta,
-                                 real scale,
-                                 real tgtCenter,
-                                 real imgCenter,
-                                 real centerR,
-                                 real centerC,
-                                 int* sourceX,
-                                 int* sourceY) {
-  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
-
-  // compute coornidates in the rotated and scaled image
-  real x_new = x - tgtCenter + centerC;
-  real y_new = y - tgtCenter + centerR;
-
-  // compute coornidates in the original image
-  x_new -= imgCenter;
-  y_new -= imgCenter;
-  real xx = H[0] * x_new + H[1] * y_new;
-  real yy = H[2] * x_new + H[3] * y_new;
-  *sourceX = __float2int_rn(xx / scale + imgCenter);
-  *sourceY = __float2int_rn(yy / scale + imgCenter);
-}
-
-/*
- * imgs:            (numImages, imgPixels)
- * target:          (numImages * samplingRate, tgtPixels)
- * the channels of one pixel are stored continuously in memory.
- *
- * created by Wei Xu (genome), converted by Jiang Wang
- */
-
-__global__ void kSamplingPatches(const real* imgs,
-                                 real* targets,
-                                 int imgSize,
-                                 int tgtSize,
-                                 const int channels,
-                                 int samplingRate,
-                                 const real* thetas,
-                                 const real* scales,
-                                 const int* centerRs,
-                                 const int* centerCs,
-                                 const real padValue,
-                                 const int numImages) {
-  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
-  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
-  const int imgPixels = imgSize * imgSize;
-  const int tgtPixels = tgtSize * tgtSize;
-  const int numPatches = numImages * samplingRate;
-
-  real tgtCenter = (tgtSize - 1) / 2;
-  real imgCenter = (imgSize - 1) / 2;
-
-  if (pxIdx < tgtPixels && caseIdx < numPatches) {
-    const int imgIdx = caseIdx / samplingRate;
-
-    // transform coordiates
-    const int pxX = pxIdx % tgtSize;
-    const int pxY = pxIdx / tgtSize;
-
-    int srcPxX, srcPxY;
-    getTranformCoord(pxX,
-                     pxY,
-                     thetas[imgIdx],
-                     scales[imgIdx],
-                     tgtCenter,
-                     imgCenter,
-                     centerCs[caseIdx],
-                     centerRs[caseIdx],
-                     &srcPxX,
-                     &srcPxY);
-
-    imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
-    targets += (caseIdx * tgtPixels + pxIdx) * channels;
-    if (srcPxX >= 0 && srcPxX < imgSize && srcPxY >= 0 && srcPxY < imgSize) {
-      for (int j = 0; j < channels; j++) targets[j] = imgs[j];
-    } else {
-      for (int j = 0; j < channels; j++) targets[j] = padValue;
-    }
-  }
-}
-
-/*
- * Functionality: generate the disturb (rotation and scaling) and
- *                sampling location sequence
- *
- * created by Wei Xu
- */
-void hl_generate_disturb_params(real*& gpuAngle,
-                                real*& gpuScaleRatio,
-                                int*& gpuCenterR,
-                                int*& gpuCenterC,
-                                int numImages,
-                                int imgSize,
-                                real rotateAngle,
-                                real scaleRatio,
-                                int samplingRate,
-                                bool isTrain) {
-  // The number of output samples.
-  int numPatches = numImages * samplingRate;
-
-  // create CPU perturbation parameters.
-  real* r_angle = new real[numImages];
-  real* s_ratio = new real[numImages];
-  int* center_r = new int[numPatches];
-  int* center_c = new int[numPatches];
-
-  // generate the random disturbance sequence and the sampling locations
-  if (isTrain) {  // random sampling for training
-    // generate rotation ans scaling parameters
-    // TODO(yuyang18): Since it will initialize random seed here, we can use
-    // rand_r instead of rand to make this method thread safe.
-    srand(getCurrentTimeStick());
-    for (int i = 0; i < numImages; i++) {
-      r_angle[i] =
-          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                          -
-                                          0.5);
-      s_ratio[i] =
-          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
-    }
-
-    int imgCenter = (imgSize - 1) / 2;
-
-    // generate sampling location parameters
-    for (int i = 0; i < numImages; i++) {
-      int j = 0;
-      srand((unsigned)time(NULL));
-      while (j < samplingRate) {
-        int pxX =
-            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-        int pxY =
-            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-
-        const real H[4] = {cos(-r_angle[i]),
-                           -sin(-r_angle[i]),
-                           sin(-r_angle[i]),
-                           cos(-r_angle[i])};
-        real x = pxX - imgCenter;
-        real y = pxY - imgCenter;
-        real xx = H[0] * x + H[1] * y;
-        real yy = H[2] * x + H[3] * y;
-
-        real srcPxX = xx / s_ratio[i] + imgCenter;
-        real srcPxY = yy / s_ratio[i] + imgCenter;
-
-        if (srcPxX >= 0 && srcPxX <= imgSize - 1 && srcPxY >= 0 &&
-            srcPxY <= imgSize - 1) {
-          center_r[i * samplingRate + j] = pxY;
-          center_c[i * samplingRate + j] = pxX;
-          j++;
-        }
-      }
-    }
-  } else {  // central crop for testing
-    for (int i = 0; i < numImages; i++) {
-      r_angle[i] = 0.0;
-      s_ratio[i] = 1.0;
-
-      for (int j = 0; j < samplingRate; j++) {
-        center_r[i * samplingRate + j] = (imgSize - 1) / 2;
-        center_c[i * samplingRate + j] = (imgSize - 1) / 2;
-      }
-    }
-  }
-
-  // copy disturbance sequence to gpu
-  hl_memcpy_host2device(gpuAngle, r_angle, sizeof(real) * numImages);
-  hl_memcpy_host2device(gpuScaleRatio, s_ratio, sizeof(real) * numImages);
-
-  delete[] r_angle;
-  delete[] s_ratio;
-
-  // copy sampling location sequence to gpu
-  hl_memcpy_host2device(gpuCenterR, center_r, sizeof(int) * numPatches);
-  hl_memcpy_host2device(gpuCenterC, center_c, sizeof(int) * numPatches);
-
-  delete[] center_r;
-  delete[] center_c;
-}
-
-void hl_conv_random_disturb_with_params(const real* images,
-                                        int imgSize,
-                                        int tgtSize,
-                                        int channels,
-                                        int numImages,
-                                        int samplingRate,
-                                        const real* gpuRotationAngle,
-                                        const real* gpuScaleRatio,
-                                        const int* gpuCenterR,
-                                        const int* gpuCenterC,
-                                        int paddingValue,
-                                        real* target) {
-  // The number of output samples.
-  int numPatches = numImages * samplingRate;
-  // The memory size of one output patch.
-  int targetSize = tgtSize * tgtSize;
-
-  dim3 threadsPerBlock(4, 128);
-  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
-
-  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
-                                                   target,
-                                                   imgSize,
-                                                   tgtSize,
-                                                   channels,
-                                                   samplingRate,
-                                                   gpuRotationAngle,
-                                                   gpuScaleRatio,
-                                                   gpuCenterR,
-                                                   gpuCenterC,
-                                                   paddingValue,
-                                                   numImages);
-
-  hl_device_synchronize();
-}
-
-void hl_conv_random_disturb(const real* images,
-                            int imgSize,
-                            int tgtSize,
-                            int channels,
-                            int numImages,
-                            real scaleRatio,
-                            real rotateAngle,
-                            int samplingRate,
-                            real* gpu_r_angle,
-                            real* gpu_s_ratio,
-                            int* gpu_center_r,
-                            int* gpu_center_c,
-                            int paddingValue,
-                            bool isTrain,
-                            real* targets) {
-  // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle,
-                             gpu_s_ratio,
-                             gpu_center_r,
-                             gpu_center_c,
-                             numImages,
-                             imgSize,
-                             rotateAngle,
-                             scaleRatio,
-                             samplingRate,
-                             isTrain);
-
-  hl_conv_random_disturb_with_params(images,
-                                     imgSize,
-                                     tgtSize,
-                                     channels,
-                                     numImages,
-                                     samplingRate,
-                                     gpu_r_angle,
-                                     gpu_s_ratio,
-                                     gpu_center_r,
-                                     gpu_center_r,
-                                     paddingValue,
-                                     targets);
-}
diff --git a/paddle/legacy/cuda/src/hl_table_apply.cu b/paddle/legacy/cuda/src/hl_table_apply.cu
deleted file mode 100644
index 7411ae35d38..00000000000
--- a/paddle/legacy/cuda/src/hl_table_apply.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                int ldo,
-                                real* table,
-                                int ldt,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDimX;
-
-  while (idy < numSamples) {
-    int tableId = ids[idy];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* out = output + idy * ldo;
-      real* tab = table + tableId * ldt;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow) {
-          paddle::paddleAtomicAdd(&tab[i], out[i]);
-        } else {
-          out[i] += tab[i];
-        }
-      }
-    }
-    idy += blockDimY * gridDimX;
-  }
-}
-
-void hl_matrix_select_rows(real* output,
-                           int ldo,
-                           real* table,
-                           int ldt,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_select_rows failed");
-}
-
-void hl_matrix_add_to_rows(real* table,
-                           int ldt,
-                           real* input,
-                           int ldi,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_add_to_rows failed");
-}
-
-template <class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  int idx = threadIdx.x + blockDimX * blockIdx.x;
-  while (idx < sizei) {
-    int index = ids[idx];
-    // check(index < sizes);
-    dst[idx] = src[index];
-    idx += blockDimX * gridDimX;
-  }
-}
-
-template <class T>
-void hl_vector_select_from(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(ids);
-  CHECK_EQ(sized, sizei);
-
-  dim3 threads(512, 1);
-  dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      dst, sized, src, sizes, ids, sizei);
-
-  CHECK_SYNC("hl_vector_select_from failed");
-}
-
-template void hl_vector_select_from(real* dst,
-                                    int sized,
-                                    const real* src,
-                                    int sizes,
-                                    const int* ids,
-                                    int sizei);
-template void hl_vector_select_from(
-    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
diff --git a/paddle/legacy/cuda/src/hl_time.cc b/paddle/legacy/cuda/src/hl_time.cc
deleted file mode 100644
index 26af9ec806a..00000000000
--- a/paddle/legacy/cuda/src/hl_time.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_time.h"
-#include <stdlib.h>
-#include <chrono>
-#include <cstdint>
-#include <iostream>
-
-using std::chrono::high_resolution_clock;
-
-int64_t getCurrentTimeStick() {
-  high_resolution_clock::time_point tp = high_resolution_clock::now();
-  high_resolution_clock::duration dtn = tp.time_since_epoch();
-  return dtn.count();
-}
diff --git a/paddle/legacy/cuda/src/hl_top_k.cu b/paddle/legacy/cuda/src/hl_top_k.cu
deleted file mode 100644
index 041ac419f5a..00000000000
--- a/paddle/legacy/cuda/src/hl_top_k.cu
+++ /dev/null
@@ -1,481 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/cuda/include/hl_base.h"
-#include "paddle/legacy/cuda/include/hl_sparse.ph"
-#include "paddle/legacy/cuda/include/hl_top_k.h"
-#include "paddle/legacy/utils/Logging.h"
-
-// using namespace hppl;
-
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-
-  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
-
-  __device__ __forceinline__ void set(real value, int id) {
-    v_ = value;
-    id_ = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair& in) {
-    v_ = in.v_;
-    id_ = in.id_;
-  }
-
-  __device__ __forceinline__ bool operator<(const real value) const {
-    return (v_ < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair& in) const {
-    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair& in) const {
-    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
-  }
-
-  real v_;
-  int id_;
-};
-
-__device__ __forceinline__ void addTo(Pair topK[],
-                                      const Pair& p,
-                                      int beamSize) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int beamSize>
-__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(Pair topK[],
-                                        real* val,
-                                        int* col,
-                                        int idx,
-                                        int dim,
-                                        const Pair& max,
-                                        int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* src,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* val,
-                                              int* col,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(
-            topK + maxLength - beam, val, col, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void blockReduce(Pair* shTopK,
-                                            int* maxId,
-                                            Pair topK[],
-                                            real** topVal,
-                                            int** topIds,
-                                            int& beam,
-                                            int& beamSize,
-                                            const int tid,
-                                            const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < blockSize / 2) {
-      if (shTopK[tid] < shTopK[tid + blockSize / 2]) {
-        maxId[tid] = tid + blockSize / 2;
-      } else {
-        maxId[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
-          maxId[tid] = maxId[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = shTopK[maxId[0]].v_;
-      **topIds = shTopK[maxId[0]].id_;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxId[0]) beam++;
-    if (--beamSize == 0) break;
-    __syncthreads();
-
-    // NOTE(zcd): temporary solution
-    unsigned mask = 0u;
-    CREATE_SHFL_MASK(mask, true);
-
-    if (tid == maxId[0]) {
-      if (beam < maxLength) {
-        shTopK[tid] = topK[beam];
-      }
-    }
-    if (maxId[0] / 32 == warp) {
-      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopK(real* topVal,
-                             int ldv,
-                             int* topIds,
-                             real* src,
-                             int lds,
-                             int dim,
-                             int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-template <int maxLength, int blockSize>
-__global__ void KeSMatrixTopK(real* topVal,
-                              int ldv,
-                              int* topIds,
-                              real* val,
-                              int* row,
-                              int* col,
-                              int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  int start = row[blockIdx.x];
-  int end = row[blockIdx.x + 1];
-  int dim = end - start;
-  val += start;
-  col += start;
-
-  if (beamSize > dim) {
-    // if the number of values to sort are less than the output size,
-    // use -1 to indicate the end of valid sorted values.
-    if (tid == 0) {
-      topIds[dim] = -1;
-    }
-
-    beamSize = dim;
-  }
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-void hl_matrix_top_k(real* topVal,
-                     int ldv,
-                     int* topIds,
-                     real* src,
-                     int lds,
-                     int dim,
-                     int beamSize,
-                     int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (beamSize > dim) beamSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, beamSize);
-
-  CHECK_SYNC("hl_matrix_top_k failed");
-}
-
-void hl_sparse_matrix_top_k(real* topVal,
-                            int ldv,
-                            int* topIds,
-                            hl_sparse_matrix_s src,
-                            int beamSize,
-                            int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
-
-  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
-    LOG(FATAL) << "parameter src is null!";
-  }
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
-
-  CHECK_SYNC("hl_sparse_matrix_top_k failed");
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopKClassificationError(real* topVal,
-                                                int ldv,
-                                                int* topIds,
-                                                real* src,
-                                                int lds,
-                                                int dim,
-                                                int beamSize,
-                                                int* label,
-                                                real* recResult) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-  int topkSize = beamSize;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-
-  __syncthreads();
-  if (tid == 0) {
-    for (int i = 0; i < topkSize; i++) {
-      if (*--topIds == label[blockIdx.x]) {
-        recResult[blockIdx.x] = 0;
-        break;
-      }
-      recResult[blockIdx.x] = 1.0f;
-    }
-  }
-}
-
-void hl_matrix_classification_error(real* topVal,
-                                    int ldv,
-                                    int* topIds,
-                                    real* src,
-                                    int lds,
-                                    int dim,
-                                    int topkSize,
-                                    int numSamples,
-                                    int* label,
-                                    real* recResult) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (topkSize > dim) topkSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
-
-  CHECK_SYNC("hl_matrix_top_k classification error failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
deleted file mode 100644
index 31a8652f1f5..00000000000
--- a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_warpctc_wrap.h"
-#include <mutex>
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag warpctc_dso_flag;
-void* warpctc_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load warpctc routine
- * via operator overloading. When PADDLE_USE_DSO is
- * false, you need to add the path of libwarp-ctc.so to
- * the linked-libs of paddle or to LD_PRELOAD.
- */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
-  struct DynLoad__##__name {                                           \
-    template <typename... Args>                                        \
-    auto operator()(Args... args) -> decltype(__name(args...)) {       \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
-      std::call_once(                                                  \
-          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
-    }                                                                  \
-  } __name;  // struct DynLoad__##__name
-
-// include all needed warp-ctc functions
-DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
-DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
-DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
-DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
-
-#undef DYNAMIC_LOAD_WARPCTC_WRAP
-
-} /* namespace dynload */
-
-#define WARPCTC_GET_VERSION dynload::get_warpctc_version
-#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
-
-static int g_warpctcVersion = -1;
-#ifndef PADDLE_TYPE_DOUBLE
-#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
-#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
-#else
-hl_warpctc_status_t fatal(...) {
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
-             << "] Error: not support double precision.";
-  // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
-  // type value
-  return CTC_STATUS_EXECUTION_FAILED;
-}
-#define WARPCTC_COMPUTE_LOSS fatal
-#define WARPCTC_GET_WORKSPACE_SIZE fatal
-#endif
-
-/**
- * Check build-in warp-ctc function using glog and it also
- * support << operator for more details error info.
- */
-#define CHECK_WARPCTC(warpctcStat)                \
-  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
-      << "warp-ctc [version " << g_warpctcVersion \
-      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
-
-void hl_warpctc_init(const size_t blank,
-                     bool useGpu,
-                     hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(options);
-
-  g_warpctcVersion = WARPCTC_GET_VERSION();
-
-  if (useGpu) {
-#ifdef __NVCC__
-    options->loc = CTC_GPU;
-    options->stream = STREAM_DEFAULT;
-#else
-    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
-#endif
-  } else {
-    options->loc = CTC_CPU;
-    options->num_threads = 1;
-  }
-
-  options->blank_label = blank;
-}
-
-void hl_warpctc_compute_loss(const real* batchInput,
-                             real* batchGrad,
-                             const int* cpuLabels,
-                             const int* cpuLabelLengths,
-                             const int* cpuInputLengths,
-                             const size_t numClasses,
-                             const size_t numSequences,
-                             real* cpuCosts,
-                             void* workspace,
-                             hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(batchInput);
-  CHECK_NOTNULL(cpuLabels);
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(cpuCosts);
-  CHECK_NOTNULL(workspace);
-  CHECK_NOTNULL(options);
-
-  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
-                                     batchGrad,
-                                     cpuLabels,
-                                     cpuLabelLengths,
-                                     cpuInputLengths,
-                                     numClasses,
-                                     numSequences,
-                                     cpuCosts,
-                                     workspace,
-                                     *options));
-}
-
-void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
-                                   const int* cpuInputLengths,
-                                   const size_t numClasses,
-                                   const size_t numSequences,
-                                   hl_warpctc_options_t* options,
-                                   size_t* bytes) {
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(options);
-  CHECK_NOTNULL(bytes);
-
-  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
-                                           cpuInputLengths,
-                                           numClasses,
-                                           numSequences,
-                                           *options,
-                                           bytes));
-}
diff --git a/paddle/legacy/function/BlockExpandOp.cpp b/paddle/legacy/function/BlockExpandOp.cpp
deleted file mode 100644
index f01f89a7277..00000000000
--- a/paddle/legacy/function/BlockExpandOp.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * \brief Converts the image data of four dimensions(NCHW) into
- *        a sequence data of three dimensions(NST) in the forward calculation,
- *        which is reversed in the backward calculation.
- *        Where N is batch size, S is the length of the sequence after each
- *        image is expanded, T is the size of each time step in the sequence.
- *
- * Arguments in forward function:
- * \param inputs[0]  Image data of NCHW format.
- * \param outputs[0] Sequence data of NST format.
- *
- * Arguments in backward function:
- * \param inputs[0]  Sequence data of NST format.
- * \param outputs[0] Image data of NCHW format.
- */
-class BlockExpandFunction : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    blocks_ = config.get<std::vector<size_t>>("blocks");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 1;
-  }
-
-  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
-    // image shape should be 4-dimensional.
-    CHECK_EQ(image.ndims(), (size_t)4);
-    // sequence shape should be 3-dimensional.
-    CHECK_EQ(sequence.ndims(), (size_t)3);
-    // The batchSize of the image needs to be equal to
-    // the batchSize of the sequence.
-    CHECK_EQ(image[0], sequence[0]);
-  }
-
-  // Calculate the shape of colData based on the shape of the image
-  // and the shape of the sequence.
-  TensorShape getColShape(const TensorShape& image,
-                          const TensorShape& sequence) const {
-    size_t inputChannels = image[1];
-    size_t inputHeight = image[2];
-    size_t inputWidth = image[3];
-    size_t seqLength = sequence[1];
-    size_t stepSize = sequence[2];
-    size_t outputHeight =
-        1 +
-        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
-    size_t outputWidth =
-        1 +
-        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
-    CHECK_EQ(seqLength, outputHeight * outputWidth);
-    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
-
-    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
-    return TensorShape({outputHeight,
-                        outputWidth,
-                        inputChannels,
-                        (size_t)blockH(),
-                        (size_t)blockW()});
-  }
-
- protected:
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> blocks_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int blockH() const { return blocks_[0]; }
-
-  inline int blockW() const { return blocks_[1]; }
-};
-
-template <DeviceType Device>
-class BlockExpandForward : public BlockExpandFunction {
- public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = inputs[0].data<real>();
-    real* seqData = outputs[0].data<real>();
-    Im2ColFunctor<kOCF, Device, real> im2col;
-    for (size_t i = 0; i < batchSize; i++) {
-      // The result of im2col is [outputHeight, outputWidth,
-      // inputChannels, filterHeight, filterWidth], and it is easy to
-      // reshape into [seqLength, stepSize], where seqLength is equal
-      // output_height * output_width, stepSize is equal
-      // input_channels * filter_height * filter_width
-      im2col(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-template <DeviceType Device>
-class BlockExpandBackward : public BlockExpandFunction {
- public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = outputs[0].data<real>();
-    real* seqData = inputs[0].data<real>();
-    Col2ImFunctor<kOCF, Device, real> col2im;
-    for (size_t i = 0; i < batchSize; i++) {
-      col2im(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BlockExpandOpTest.cpp b/paddle/legacy/function/BlockExpandOpTest.cpp
deleted file mode 100644
index 8fca4f6fdc8..00000000000
--- a/paddle/legacy/function/BlockExpandOpTest.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(BlockExpandForward, real) {
-  for (size_t batchSize : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t inputHeight : {5, 33}) {
-        for (size_t inputWidth : {5, 32}) {
-          for (size_t block : {1, 3, 5}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // init Test object
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("BlockExpand",
-                                       FuncConfig()
-                                           .set("strides", strides)
-                                           .set("paddings", paddings)
-                                           .set("blocks", blocks));
-
-                size_t outputHeight =
-                    1 +
-                    (inputHeight + 2 * padding - block + stride - 1) / stride;
-                size_t outputWidth =
-                    1 +
-                    (inputWidth + 2 * padding - block + stride - 1) / stride;
-                TensorShape inputShape =
-                    TensorShape({batchSize, channels, inputHeight, inputWidth});
-                TensorShape outputShape =
-                    TensorShape({batchSize,
-                                 outputHeight * outputWidth,
-                                 channels * block * block});
-                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape));
-                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
-                // run Function
-                test.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(BlockExpandBackward, real) {
-  for (size_t batchSize : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t inputHeight : {5, 33}) {
-        for (size_t inputWidth : {5, 32}) {
-          for (size_t block : {1, 3, 5}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // init Test object
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("BlockExpandGrad",
-                                       FuncConfig()
-                                           .set("strides", strides)
-                                           .set("paddings", paddings)
-                                           .set("blocks", blocks));
-
-                size_t outputHeight =
-                    1 +
-                    (inputHeight + 2 * padding - block + stride - 1) / stride;
-                size_t outputWidth =
-                    1 +
-                    (inputWidth + 2 * padding - block + stride - 1) / stride;
-                TensorShape inputShape =
-                    TensorShape({batchSize, channels, inputHeight, inputWidth});
-                TensorShape outputShape =
-                    TensorShape({batchSize,
-                                 outputHeight * outputWidth,
-                                 channels * block * block});
-                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
-                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape),
-                                ADD_TO);
-                // run Function
-                test.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArg.cpp b/paddle/legacy/function/BufferArg.cpp
deleted file mode 100644
index 1f3d505c31b..00000000000
--- a/paddle/legacy/function/BufferArg.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-
-#include "BufferArg.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-const SequenceArg& BufferArg::sequence() const {
-  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
-  return dynamic_cast<const SequenceArg&>(*this);
-}
-
-const SparseMatrixArg& BufferArg::sparse() const {
-  CHECK_EQ(bufferType_, TENSOR_SPARSE);
-  return dynamic_cast<const SparseMatrixArg&>(*this);
-}
-
-SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArg.h b/paddle/legacy/function/BufferArg.h
deleted file mode 100644
index 1f47ad556d2..00000000000
--- a/paddle/legacy/function/BufferArg.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-enum BufferType {
-  TENSOR_UNKNOWN = 0,
-  TENSOR_NORMAL = 1,
-  TENSOR_SEQUENCE_ID = 2,
-  TENSOR_SEQUENCE_DATA = 3,
-  TENSOR_SPARSE = 4
-};
-
-class BufferArg;
-class SequenceArg;
-class SparseMatrixArg;
-
-/**
- * \brief BufferArg used as the argument type of Function.
- *
- * The arguments of the Paddle Function have four Buffer types.
- * 1. BufferArg for a dense Buffer of any dimension.
- * 2. SequenceIdArg for a Buffer of sequence start positions.
- * 3. SequenceArg for a Buffer of sequence data.
- * 4. SparseMatrixArg for a Buffer of sparse matrix.
- *
- * Buffer shape
- * For most buffers, the first dimension `shape()[0]` represents
- * the size of the mini-batch.
- *
- * Buffer argType
- * There is an ArgType property for the BufferArg used as Function Output.
- * Whether the result of the Function calculation is assigned to the
- * output Buffer or added to the output Buffer is determined by the
- * argType_ property of the output BufferArg.
- */
-
-// ArgType is only used by output BufferArg.
-// For input argument, argType_ is ignored.
-// For output argument, need to set the argType_ of the BufferArg.
-enum ArgType {
-  UNSPECIFIED = 0,
-  ASSIGN_TO = 1,
-  ADD_TO = 2,
-};
-class BufferArg {
- public:
-  void setArgType(ArgType argType) { argType_ = argType; }
-
-  ArgType getArgType() const { return argType_; }
-
- public:
-  BufferArg(ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf,
-            ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(2),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, matrix.getHeight());
-    shape_.setDim(1, matrix.getWidth());
-  }
-
-  BufferArg(const Matrix& matrix,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(shape),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
-  }
-
-  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(VALUE_TYPE_INT32),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::Matrix matrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)2, shape_.ndims());
-    return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
-  }
-
-  template <typename VType, DeviceType DType>
-  typename Tensor<VType, DType>::Vector vector() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<VType>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)1, shape_.ndims());
-    return typename Tensor<VType, DType>::Vector(
-        shape_[0], reinterpret_cast<VType*>(buf_));
-  }
-
-  virtual ~BufferArg() {}
-
-  template <typename T>
-  T* data() const {
-    return reinterpret_cast<T*>(buf_);
-  }
-
-  void* data() const { return buf_; }
-  ValueType valueType() const { return valueType_; }
-  BufferType bufferType() const { return bufferType_; }
-  const TensorShape& shape() const { return shape_; }
-  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
-  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
-  virtual size_t numElements() const { return shape_.getElements(); }
-
-  const SequenceArg& sequence() const;
-  const SparseMatrixArg& sparse() const;
-
- protected:
-  void* buf_;
-  ValueType valueType_;
-  TensorShape shape_;
-  BufferType bufferType_{TENSOR_UNKNOWN};
-  ArgType argType_{UNSPECIFIED};
-  // TODO(tianbing), add deviceType_
-  // leading dimensions. The size is dims_.size()
-  // Dims lds_;
-};
-
-// sequence start positions in a mini-batch of sequences
-// shape_.ndims() == 1
-// valueType_ = int32
-// if a < b then value_.buf_[a] < value_.buf_[b]
-class SequenceIdArg : public BufferArg {
- public:
-  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
-      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    CHECK_GE(shape_[0], 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(void* buf,
-                const TensorShape& shape,
-                ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  ~SequenceIdArg() {}
-
-  size_t numSeqs() const { return numSeqs_; }
-
- private:
-  size_t numSeqs_;
-};
-
-// sequences data
-// For mini-batch calculate,
-// one batch can contain more than one sequence of data.
-// SequenceArg can be used to represent sequences that contain multiple
-// unequal lengths.
-class SequenceArg : public BufferArg {
- public:
-  SequenceArg(ValueType valueType,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        startPositions_(TensorShape({shape[0]})) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(void* buf,
-              ValueType valueType,
-              const TensorShape& shape,
-              const SequenceIdArg& startPositions,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  ~SequenceArg() {}
-
-  void* getIdBuf() const { return startPositions_.data(); }
-  size_t numSeqs() const { return startPositions_.numSeqs(); }
-  SequenceIdArg& getSequenceId() { return startPositions_; }
-  const SequenceIdArg& getSequenceId() const { return startPositions_; }
-
- private:
-  SequenceIdArg startPositions_;
-};
-
-// sparse matrix
-// valueType_ == float or double
-// shape_.ndims() == 2
-class SparseMatrixArg : public BufferArg {
- public:
-  SparseMatrixArg(void* buf,
-                  ValueType valueType,
-                  const TensorShape& shape,
-                  const BufferArg& row,
-                  const BufferArg& col,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        row_(row),
-        col_(col),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-    CHECK_EQ(row_.shape().ndims(), 1UL);
-    CHECK_EQ(col_.shape().ndims(), 1UL);
-    if (format_ == T_SPARSE_CSR) {
-      CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format_ == T_SPARSE_CSC) {
-      CHECK_EQ(nnz, row.shape()[0]);
-    }
-  }
-
-  SparseMatrixArg(ValueType valueType,
-                  const TensorShape& shape,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-
-    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
-    row_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
-    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
-    col_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
-  }
-
-  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ(2UL, shape_.ndims());
-    return typename Tensor<real, DType>::SparseMatrix(
-        reinterpret_cast<real*>(buf_),
-        reinterpret_cast<int*>(row_.data()),
-        reinterpret_cast<int*>(col_.data()),
-        shape_[0],
-        shape_[1],
-        nnz_,
-        static_cast<SparseValueType>(type_),
-        static_cast<SparseFormat>(format_),
-        false);
-  }
-
-  ~SparseMatrixArg() {}
-
-  void* getRowBuf() const { return row_.data(); }
-
-  void* getColBuf() const { return col_.data(); }
-
-  size_t nnz() const { return nnz_; }
-
-  size_t numElements() const override { return nnz_; }
-
-  SparseDataFormat dataFormat() const { return format_; }
-
-  SparseDataType dataType() const { return type_; }
-
- private:
-  BufferArg row_;
-  BufferArg col_;
-  size_t nnz_;
-  SparseDataFormat format_;
-  SparseDataType type_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArgTest.cpp b/paddle/legacy/function/BufferArgTest.cpp
deleted file mode 100644
index 1ec153bea89..00000000000
--- a/paddle/legacy/function/BufferArgTest.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BufferArg.h"
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/MemoryHandle.h"
-
-namespace paddle {
-
-TEST(BufferTest, BufferArg) {
-  TensorShape shape({8, 10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_FLOAT));
-  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-}
-
-TEST(BufferTest, SequenceIdArg) {
-  TensorShape shape({10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_INT32));
-  SequenceIdArg buffer(memory.getBuf(), shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-  EXPECT_EQ(buffer.numSeqs(), 9U);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CMakeLists.txt b/paddle/legacy/function/CMakeLists.txt
deleted file mode 100644
index 29b4ac098e2..00000000000
--- a/paddle/legacy/function/CMakeLists.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-file(GLOB h_files . *Op.h)
-file(GLOB cpp_files . *Op.cpp)
-
-list(APPEND h_files Function.h)
-list(APPEND cpp_files Function.cpp)
-list(APPEND cpp_files BufferArg.cpp)
-list(APPEND cpp_files GemmFunctor.cpp)
-if(USE_EIGEN_FOR_BLAS)
-  list(APPEND cpp_files EigenGemm.cpp)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(WITH_GPU)
-    file(GLOB cu_files . *OpGpu.cu)
-    cuda_compile(cu_objs ${cu_files})
-endif()
-
-if(USE_NNPACK)
-  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
-  if(WITH_TESTING)
-    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
-  endif()
-endif()
-
-list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
-
-add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
-add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function paddle_proto)
-
-if(WITH_TESTING)
-if(WITH_GPU)
-    # TODO:
-    # file(GLOB test_files . *OpTest.cpp)
-    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(CrossMapNormalOpTest)
-    add_simple_unittest(TensorShapeTest)
-    add_simple_unittest(TensorTypeTest)
-    add_simple_unittest(BufferArgTest)
-    add_simple_unittest(FunctionTest)
-    add_simple_unittest(ContextProjectionOpTest)
-    add_simple_unittest(PadOpTest)
-    add_simple_unittest(MulOpTest)
-    add_simple_unittest(CosSimOpTest)
-    add_simple_unittest(RowConvOpTest)
-    add_simple_unittest(BlockExpandOpTest)
-    add_simple_unittest(CropOpTest)
-    add_simple_unittest(SwitchOpTest)
-    add_simple_unittest(ScaleSubRegionOpTest)
-endif()
-
-add_simple_unittest(Im2ColTest)
-add_simple_unittest(GemmConvOpTest)
-add_simple_unittest(DepthwiseConvOpTest)
-endif()
diff --git a/paddle/legacy/function/ContextProjectionOp.cpp b/paddle/legacy/function/ContextProjectionOp.cpp
deleted file mode 100644
index 05a3f915862..00000000000
--- a/paddle/legacy/function/ContextProjectionOp.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjectionOp.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-/**
- * Context Projection Forward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                               const CpuMatrix& input_mat,
-                                               const CpuMatrix& weight_mat,
-                                               const CpuIVector& seq_vec,
-                                               size_t context_length,
-                                               int context_start,
-                                               size_t begin_pad) {
-  const int* starts = seq_vec.getData();
-  const size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat)
-                  .subMatrix(begin_pad + context_start + j - pad_size,
-                             pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src =
-          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
-      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
-      dst->addAtOffset(*src, j * input_mat.getWidth());
-    }
-  }
-}
-
-/**
- * Paddle Function for Context Projection Forward.
- * Calculate the output layer value sequence after context projection.
- *
- * What is Context Projection for a sequence?
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * \param outputs[0].matrix   output layer value, n * (d * l)
- * \param outputs[0].vector   start position sequence, n * 1
- * \param inputs[0].matrix    input layer value, n * d
- * \param inputs[0].vector    start position sequence, n * 1
- * \param inputs[1].matrix    input layer weight, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionForwardFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(1UL == inputs.size() || 2UL == inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
-    if (2UL == inputs.size()) {
-      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-      /// dim of input == dim of weight
-      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
-    }
-
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-    auto out_mat = out_seq.matrix<Device>();
-    const auto in_mat = val_seqs.matrix<Device>();
-    const auto w_mat =
-        (2UL == inputs.size() && inputs[1].data())
-            ? inputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
-
-    ContextProjectionForward<Device>(out_mat,
-                                     in_mat,
-                                     w_mat,
-                                     seq_vec,
-                                     context_length_,
-                                     context_start_,
-                                     begin_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-};
-
-/**
- * Context Projection Backward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
-                                                CpuMatrix& in_grad_mat,
-                                                CpuMatrix& w_grad_mat,
-                                                const CpuIVector& seq_vec,
-                                                size_t context_length,
-                                                int context_start,
-                                                size_t begin_pad,
-                                                bool is_padding,
-                                                size_t total_pad) {
-  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
-                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
-  const int* starts = seq_vec.getData();
-  size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(
-              begin_pad + context_start + j - pad_size, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
-                          .subMatrix(dst_begin, dst_end - dst_begin);
-      src->addAtOffset(*dst, j * input_dim);
-    }
-  }
-}
-
-/**
- * Context Projection Backward Function.
- * Update the weight gradient and input layer gradient with backprop
- *
- * \param inputs[0].matrix          output layer grad, n * (d * l)
- * \param inputs[0].vector          start position sequence, n * 1
- * \param outputs[0].matrix         input layer grad, n * d
- * \param outputs[0].vector         start position sequence, n * 1
- * \param outputs[1]                weight grad, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionBackwardFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    is_padding_ = config.get<bool>("is_padding");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK(1UL == outputs.size() || 2UL == outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
-
-    /// input and output grad has the same batch_size
-    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
-    /// dim of output grad = dim of input grad * context_length
-    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-
-    if (2UL == outputs.size()) {
-      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
-      /// dim of input grad == dim of weight
-      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
-      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    }
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto in_grad_mat =
-        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                        : out_seq.matrix<Device>();
-    auto w_grad_mat =
-        (2UL == outputs.size() && outputs[1].data())
-            ? outputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-
-    ContextProjectionBackward<Device>(out_grad_mat,
-                                      in_grad_mat,
-                                      w_grad_mat,
-                                      seq_vec,
-                                      context_length_,
-                                      context_start_,
-                                      begin_pad_,
-                                      is_padding_,
-                                      total_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  bool is_padding_;
-  size_t total_pad_;
-};
-
-/**
- * Context Projection Backward Data Function
- * Update input layer grad
- * input:  sequence of output layer grad
- * output: sequence of input layer grad
- *
- * \param outputs[0].matrix              input layer grad, n * d
- * \param outputs[0].vector              start position sequence, n * 1
- * \param inputs[0].matrix               output layer grad, n * (d * l)
- * \param inputs[0].vector               start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardDataFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    /// output layer grad dim == input layer grad dim * context_length_
-    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    auto in_grad_mat = out_seq.matrix<Device>();
-
-    ContextProjectionBackwardData<Device>(
-        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-};
-
-/**
- * Context Projection Backward Weight Function
- * Update weight grad by backprop
- * input:  sequence of output layer grad
- * output: weight grad
- *
- * \param outputs[0]                   weight grad, pad * d
- * \param inputs[0].matrix             output layer grad, n * (d * l)
- * \param inputs[0].vecotr             start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardWeightFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
-    /// output layer grad dim == weight dim * context_length_
-    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto w_grad_mat = outputs[0].matrix<Device>();
-    ContextProjectionBackwardWeight<Device>(out_grad_mat,
-                                            w_grad_mat,
-                                            seq_vec,
-                                            context_length_,
-                                            context_start_,
-                                            total_pad_,
-                                            begin_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  size_t total_pad_;
-};
-
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    CPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    CPU,
-                    ContextProjectionBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    GPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    GPU,
-                    ContextProjectionBackwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
-                    GPU,
-                    ContextProjectionBackwardDataFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
-                    GPU,
-                    ContextProjectionBackwardWeightFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOp.h b/paddle/legacy/function/ContextProjectionOp.h
deleted file mode 100644
index 822734a78e6..00000000000
--- a/paddle/legacy/function/ContextProjectionOp.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Context Projection Forward.
- *
- * \param[in/out]  outputs           output data.
- * \param[in]      input             input data.
- * \param[in]      weight            input weight.
- * \param[in]      sequence          input data.
- * \param[in]      context_length    consecutive rows for concatenation.
- * \param[in]      context_start     context start position.
- * \param[in]      begin_pad         begining pad position.
- * \param[in]      is_padding        whether padding 0 or not.
- *
- */
-template <DeviceType DType>
-void ContextProjectionForward(
-    typename Tensor<real, DType>::Matrix& output,
-    const typename Tensor<real, DType>::Matrix& input,
-    const typename Tensor<real, DType>::Matrix& weight,
-    const typename Tensor<int, DType>::Vector& sequence,
-    size_t context_length,
-    int context_start,
-    size_t begin_pad);
-
-/**
- * \brief   Context Projection Backward.
- *
- * \param[out]  outputs           output gradient.
- * \param[in]   input             input gradient.
- * \param[in]   weight            input weight gradient.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
- *
- */
-template <DeviceType DType>
-void ContextProjectionBackward(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& in_grad,
-    typename Tensor<real, DType>::Matrix& w_grad,
-    const typename Tensor<int, DType>::Vector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t begin_pad,
-    bool is_padding,
-    size_t total_pad);
-
-template <DeviceType DType>
-void ContextProjectionBackwardData(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& in_grad,
-    const typename Tensor<int, DType>::Vector& sequence,
-    size_t context_length,
-    int context_start);
-
-template <DeviceType DType>
-void ContextProjectionBackwardWeight(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& w_grad,
-    const typename Tensor<int, DType>::Vector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t total_pad,
-    size_t begin_pad);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOpGpu.cu b/paddle/legacy/function/ContextProjectionOpGpu.cu
deleted file mode 100644
index 0a4d865e2c4..00000000000
--- a/paddle/legacy/function/ContextProjectionOpGpu.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjectionOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-template <bool padding>
-__global__ void KeContextProjectionForward(const real* input,
-                                           const int* sequence,
-                                           const real* weight,
-                                           real* output,
-                                           int input_dim,
-                                           int context_length,
-                                           int context_start,
-                                           int begin_pad) {
-  int idx = threadIdx.x;
-  int block_size = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId + 1];
-  real value = 0;
-
-  int instances = seq_end - seq_start + context_length - 1;
-  output += seq_start * input_dim * context_length;
-  input += seq_start * input_dim;
-  for (int k = 0; k <= input_dim / block_size; k++) {
-    if (idx < input_dim) {
-      for (int i = 0; i < instances; i++) {
-        // i + context_start;
-        if ((i + context_start) < 0) {
-          if (padding) {
-            value = weight[i * input_dim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + context_start) >= (seq_end - seq_start)) {
-          if (padding) {
-            value =
-                weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
-                           input_dim +
-                       idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + context_start) * input_dim + idx];
-        }
-
-        int outx = (i - context_length) < 0 ? i : (context_length - 1);
-        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
-        real* output_r =
-            output + outy * input_dim * context_length + outx * input_dim;
-        for (int j = outy; j < seq_end - seq_start; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (context_length - 1) * input_dim;
-        }
-      }
-    }
-    idx += block_size;
-  }
-}
-
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weight          padding data.
- * @param[out]  output          output sequence.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   input_dim        input sequence dimension.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- * @param[in]   begin_pad        number of extra timesteps added at the
- * beginning.
- *
- */
-void hl_context_projection_forward(const real* input,
-                                   const int* sequence,
-                                   const real* weight,
-                                   real* output,
-                                   size_t num_sequences,
-                                   size_t input_dim,
-                                   size_t context_length,
-                                   int context_start,
-                                   size_t begin_pad) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-
-  int block_size = 128;
-  int blocks_x = num_sequences;
-  int blocks_y = 1;
-  dim3 threads(block_size, 1);
-  dim3 grid(blocks_x, blocks_y);
-
-  if (weight) {
-    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        input,
-        sequence,
-        weight,
-        output,
-        input_dim,
-        context_length,
-        context_start,
-        begin_pad);
-  } else {
-    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        input,
-        sequence,
-        weight,
-        output,
-        input_dim,
-        context_length,
-        context_start,
-        begin_pad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-template <>
-void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
-                                               const GpuMatrix& input,
-                                               const GpuMatrix& weight,
-                                               const GpuIVector& sequence,
-                                               size_t context_length,
-                                               int context_start,
-                                               size_t begin_pad) {
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                weight ? weight.getData() : nullptr,
-                                output.getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                context_length,
-                                context_start,
-                                begin_pad);
-}
-
-__global__ void KeContextProjectionBackwardData(const real* out_grad,
-                                                const int* sequence,
-                                                real* in_grad,
-                                                size_t input_dim,
-                                                int context_length,
-                                                int context_start) {
-  int idx = threadIdx.x;
-  int block_size = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId + 1];
-  real value = 0;
-
-  int instances = seq_end - seq_start + context_length - 1;
-  auto out = const_cast<real*>(out_grad);
-  out += seq_start * input_dim * context_length;
-  in_grad += seq_start * input_dim;
-  for (int k = 0; k <= input_dim / block_size; k++) {
-    if (idx < input_dim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + context_start) < 0) {
-          continue;
-        } else if ((i + context_start) >= (seq_end - seq_start)) {
-          continue;
-        } else {
-          // value = 0;
-          value = in_grad[(i + context_start) * input_dim + idx];
-        }
-
-        int outx = (i - context_length) < 0 ? i : (context_length - 1);
-        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
-        real* output_r =
-            out + outy * input_dim * context_length + outx * input_dim;
-        for (int j = outy; j < seq_end - seq_start; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (context_length - 1) * input_dim;
-        }
-        in_grad[(i + context_start) * input_dim + idx] = value;
-      }
-    }
-    idx += block_size;
-  }
-}
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   out_grad         output gradient.
- * @param[in]   sequence         sequence index.
- * @param[out]  input_grad       input gradient.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   input_dim        input sequence dimension.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- *
- */
-void hl_context_projection_backward_data(const real* out_grad,
-                                         const int* sequence,
-                                         real* input_grad,
-                                         size_t num_sequences,
-                                         size_t input_dim,
-                                         size_t context_length,
-                                         int context_start) {
-  CHECK_NOTNULL(out_grad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(input_grad);
-
-  int block_size = 128;
-  int blocks_x = num_sequences;
-  int blocks_y = 1;
-  dim3 threads(block_size, 1);
-  dim3 grid(blocks_x, blocks_y);
-  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      out_grad, sequence, input_grad, input_dim, context_length, context_start);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                    GpuMatrix& in_grad,
-                                                    const GpuIVector& sequence,
-                                                    size_t context_length,
-                                                    int context_start) {
-  hl_context_projection_backward_data(out_grad.getData(),
-                                      sequence.getData(),
-                                      in_grad.getData(),
-                                      sequence.getSize() - 1,
-                                      in_grad.getWidth(),
-                                      context_length,
-                                      context_start);
-}
-
-template <int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
-                                                  const int* sequence,
-                                                  real* w_grad,
-                                                  int num_sequences,
-                                                  int w_dim,
-                                                  int context_length,
-                                                  int context_start,
-                                                  int begin_pad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / pad_of_block;
-  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weight_idx < w_dim) {
-    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
-      int seq_start = sequence[seqId];
-      int seq_end = sequence[seqId + 1];
-      output_r =
-          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;
-
-      if (context_start < 0) {
-        if (padId + context_start < 0) {
-          instanceId = padId;
-        } else {
-          // begin_pad > 0;
-          instanceId =
-              (padId - begin_pad) + (seq_end - seq_start) - context_start;
-        }
-      } else {
-        if (padId + (seq_end - seq_start) < context_start) {
-          continue;
-        } else {
-          // begin_pad == 0;
-          instanceId = padId + (seq_end - seq_start) - context_start;
-        }
-      }
-
-      int outx =
-          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
-      int outy = (instanceId - context_length) < 0
-                     ? 0
-                     : (instanceId - (context_length - 1));
-      output_r += outy * w_dim * context_length + outx * w_dim;
-      for (int j = outy; j < seq_end - seq_start; j++) {
-        value += output_r[weight_idx];
-        if (j - outy == outx) break;
-        output_r += (context_length - 1) * w_dim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weight_idx < w_dim) {
-    if (idy == 0) {
-      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
-    }
-  }
-}
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   out_grad         output gradient.
- * @param[in]   sequence         sequence index.
- * @param[out]  w_grad           weight gradient.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   w_dim            input sequence dimension.
- * @param[in]   total_pad        number of extra timesteps.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- * @param[in]   begin_pad        number of extra timesteps added at the
- * beginning.
- *
- */
-void hl_context_projection_backward_weight(const real* out_grad,
-                                           const int* sequence,
-                                           real* w_grad,
-                                           size_t num_sequences,
-                                           size_t w_dim,
-                                           size_t total_pad,
-                                           size_t context_length,
-                                           int context_start,
-                                           size_t begin_pad) {
-  CHECK_NOTNULL(out_grad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(w_grad);
-
-  int threads_x = 32;
-  int threads_y = 32;
-  int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
-  dim3 threads(threads_x, threads_y);
-  dim3 grid(blocks_x, 1);
-
-  KeContextProjectionBackwardWeight<32,
-                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      out_grad,
-      sequence,
-      w_grad,
-      num_sequences,
-      w_dim,
-      context_length,
-      context_start,
-      begin_pad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
-template <>
-void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                      GpuMatrix& w_grad,
-                                                      const GpuIVector& seq_vec,
-                                                      size_t context_length,
-                                                      int context_start,
-                                                      size_t total_pad,
-                                                      size_t begin_pad) {
-  hl_context_projection_backward_weight(out_grad.getData(),
-                                        seq_vec.getData(),
-                                        w_grad.getData(),
-                                        seq_vec.getSize() - 1,
-                                        w_grad.getWidth(),
-                                        total_pad,
-                                        context_length,
-                                        context_start,
-                                        begin_pad);
-}
-
-template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                GpuMatrix& in_grad,
-                                                GpuMatrix& w_grad,
-                                                const GpuIVector& sequence,
-                                                size_t context_length,
-                                                int context_start,
-                                                size_t begin_pad,
-                                                bool is_padding,
-                                                size_t total_pad) {
-  if (in_grad) {
-    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
-        out_grad, in_grad, sequence, context_length, context_start);
-  }
-  if (is_padding && w_grad) {
-    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
-                                                     w_grad,
-                                                     sequence,
-                                                     context_length,
-                                                     context_start,
-                                                     total_pad,
-                                                     begin_pad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOpTest.cpp b/paddle/legacy/function/ContextProjectionOpTest.cpp
deleted file mode 100644
index 3b0a34567fe..00000000000
--- a/paddle/legacy/function/ContextProjectionOpTest.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-void testMatrixProjectionForward(int context_start,
-                                 size_t context_length,
-                                 bool is_padding,
-                                 size_t batch_size,
-                                 size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionForward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start)));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
-  if (is_padding) {  // weight
-    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
-  }
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT,
-                  TensorShape{batch_size, input_dim * context_length}),
-      ADD_TO);
-
-  // run Function
-  test.run();
-}
-
-void testMatrixProjectionBackward(int context_start,
-                                  size_t context_length,
-                                  bool is_padding,
-                                  size_t batch_size,
-                                  size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionBackward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start))
-          .set("is_padding", is_padding)
-          .set("total_pad", pad));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(SequenceArg(
-      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
-      ADD_TO);
-  if (is_padding) {  // weight
-    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
-                    ADD_TO);
-  }
-
-  // run Function
-  test.run();
-}
-
-TEST(ContextProjection, Projection) {
-  for (auto context_start : {-5, -3, -1, 0, 3}) {
-    for (auto context_length : {1, 2, 5, 7}) {
-      for (auto trainable_padding : {false, true}) {
-        for (auto batch_size : {1, 2, 5, 20, 100}) {
-          for (auto input_dim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " context_start=" << context_start
-                    << " context_length=" << context_length
-                    << " trainable_padding=" << trainable_padding
-                    << " batch_size=" << batch_size
-                    << " input_dim=" << input_dim;
-            testMatrixProjectionForward(context_start,
-                                        context_length,
-                                        trainable_padding,
-                                        batch_size,
-                                        input_dim);
-            testMatrixProjectionBackward(context_start,
-                                         context_length,
-                                         trainable_padding,
-                                         batch_size,
-                                         input_dim);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/ConvOp.h b/paddle/legacy/function/ConvOp.h
deleted file mode 100644
index 2d8437bcfe6..00000000000
--- a/paddle/legacy/function/ConvOp.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/*
- * \brief Based on the ConvFunctionBase class, the forward calculation,
- *        backward input calculation and backward filter calculation
- *        of convolution operations can be implemented.
- *
- * Arguments of forward and backward calculation:
- *   1. Forward calculation of convolution.
- *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
- *      The first and second input arguments are input image and filter data.
- *      The output argument is output image.
- *
- *   2. Backward input calculation of convolution.
- *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
- *      The first and second input arguments are output grad image
- *      and filter data.
- *      The output argument is input grad image.
- *
- *   3. Backward filter calculation of convolution.
- *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
- *      The first and second input arguments are output grad image
- *      and input image.
- *      The output argument is filter grad.
- *
- * Arguments format of input, filter and output:
- *   1. Input image, output image, input image gradient, output image gradient
- *      are all NCHW format. Where N is batch size, C is the number of channels,
- *      H and W is the height and width of image or image gradient.
- *
- *   2. The format of the filter data is MCHW, where M is the number of output
- *      image channels, C is the number of input image channels,
- *      H and W is height and width of filter.
- *
- *      If `groups` is greater than 1, the filter's data format should be GMCHW,
- *      where G is the `groups`, and G * M is the number of output image
- *      channels, G * C is the number of input image channels,
- *      H and W is height and width of filter.
- */
-class ConvFunctionBase : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    dilations_ = config.get<std::vector<size_t>>("dilations");
-    groups_ = config.get<size_t>("groups");
-
-    // number of inputs and outputs
-    numInputs_ = 2;
-    numOutputs_ = 1;
-  }
-
-  // input can be INPUT and INPUT_GRAD
-  // filter can be FILTER and FILTER_GRAD
-  // output can be OUTPUT and OUTPUT_GRAD
-  void checkShape(const TensorShape& input,
-                  const TensorShape& filter,
-                  const TensorShape& output) {
-    // inputs and outputs arguments should be 4-dimensional.
-    CHECK_EQ(input.ndims(), (size_t)4);
-    CHECK_EQ(output.ndims(), (size_t)4);
-    // The batchSize of the input needs to be equal to
-    // the batchSize of the output.
-    CHECK_EQ(input[0], output[0]);
-
-    if (filter.ndims() == (size_t)4) {
-      // If the filter's dimension is 4, groups convolution is not supported.
-      CHECK_EQ(groups_, (size_t)1);
-      // The input and output channel dimensions are the second and first
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[1]);
-      CHECK_EQ(output[1], filter[0]);
-    } else {
-      // filter argument should be 5-dimensional.
-      CHECK_EQ(filter.ndims(), (size_t)5);
-      // The first dimension of the filter is the size of the group
-      CHECK_EQ(filter[0], groups_);
-      // The input and output channel dimensions are the third and second
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[2] * groups_);
-      CHECK_EQ(output[1], filter[1] * groups_);
-    }
-  }
-
- protected:
-  size_t getFilterHeight(const TensorShape& filter) const {
-    return filter[filter.ndims() - 2];
-  }
-
-  size_t getFilterWidth(const TensorShape& filter) const {
-    return filter[filter.ndims() - 1];
-  }
-
-  // determine whether im2col needs to be performed
-  inline bool isNeedIm2col(const TensorShape& filter) const {
-    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
-             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
-             paddingW() == 0);
-  }
-
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> dilations_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  size_t groups_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int dilationH() const { return dilations_[0]; }
-
-  inline int dilationW() const { return dilations_[1]; }
-
-  // A temporary memory in convolution calculation.
-  MemoryHandlePtr memory_;
-
-  template <DeviceType Device>
-  void resizeBuffer(size_t newSize) {
-    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
-      if (Device == DEVICE_TYPE_CPU) {
-        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-      } else {
-        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ConvOpTest.h b/paddle/legacy/function/ConvOpTest.h
deleted file mode 100644
index 5eac6089786..00000000000
--- a/paddle/legacy/function/ConvOpTest.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FunctionTest.h"
-
-namespace paddle {
-
-template <DeviceType DType1, DeviceType DType2>
-void forward(Compare2Function<DType1, DType2>& test,
-             const TensorShape& input,
-             const TensorShape& filter,
-             const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-void backward_input(Compare2Function<DType1, DType2>& test,
-                    const TensorShape& input,
-                    const TensorShape& filter,
-                    const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-void backward_filter(Compare2Function<DType1, DType2>& test,
-                     const TensorShape& input,
-                     const TensorShape& filter,
-                     const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO);
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-using Function = void (*)(Compare2Function<DType1, DType2>& test,
-                          const TensorShape& input,
-                          const TensorShape& filter,
-                          const TensorShape& output);
-
-/**
- * \brief A basic convolution function test interface.
- *
- * \param conv1         type name of convolution function 1.
- * \param conv2         type name of convolution function 2.
- * \param function      test function, can be one of the forward, backward_input
- *                      backward_filter function.
- * Example:
- * 1. Compare GemmConv's CPU and GPU implementation:
- *   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
- *      "GemmConv-CPU", "GemmConv-GPU", forward);
- */
-template <DeviceType DType1, DeviceType DType2>
-void Convolution(const std::string& conv1,
-                 const std::string& conv2,
-                 Function<DType1, DType2> function) {
-  for (size_t batchSize : {1, 5}) {
-    for (size_t inputSize : {7, 14, 31}) {
-      for (size_t filterSize : {1, 3, 5}) {
-        for (size_t inputChannels : {3, 16}) {
-          for (size_t outputChannels : {3, 16}) {
-            if (outputChannels < inputChannels) continue;
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1, 3}) {
-                  if (padding >= filterSize) break;
-                  size_t filterS = (filterSize - 1) * dilation + 1;
-
-                  if (inputSize + 2 * padding < filterS) break;
-
-                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
-                       conv1 == "NNPACKConv-CPU" ||
-                       conv2 == "NNPACKConv-CPU") &&
-                      dilation > 1)
-                    break;
-
-                  // NNPACK only supports stride = 1 if batchSize > 1
-                  if ((conv1 == "NNPACKConv-CPU" ||
-                       conv2 == "NNPACKConv-CPU") &&
-                      batchSize > 1 && stride > 1)
-                    break;
-
-                  size_t outputSize =
-                      (inputSize - filterS + 2 * padding + stride) / stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputSize
-                          << " inputWidth=" << inputSize
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterSize
-                          << " filterWidth=" << filterSize
-                          << " outputHeight=" << outputSize
-                          << " outputWidth=" << outputSize
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  std::vector<size_t> dilations = {dilation, dilation};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("dilations", dilations)
-                          .set("groups", (size_t)1)
-                          .set("algo", (std::string) "auto"));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterSize, filterSize};
-                  TensorShape output{
-                      batchSize, outputChannels, outputSize, outputSize};
-
-                  function(test, input, filter, output);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief A convolution function test interface for
- *        image height is not equal image width.
- */
-template <DeviceType DType1, DeviceType DType2>
-void Convolution2(const std::string& conv1,
-                  const std::string& conv2,
-                  Function<DType1, DType2> function) {
-  for (size_t batchSize : {4}) {
-    for (size_t inputHeight : {7, 31}) {
-      for (size_t inputWidth : {10, 54}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t inputChannels : {7}) {
-              for (size_t outputChannels : {7}) {
-                size_t stride = 1;
-                size_t padding = 0;
-                size_t dilation = 1;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputHeight
-                        << " inputWidth=" << inputWidth
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterHeight
-                        << " filterWidth=" << filterWidth
-                        << " outputHeight=" << outputHeight
-                        << " outputWidth=" << outputWidth
-                        << " stride=" << stride << " padding=" << padding;
-
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> dilations = {dilation, dilation};
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", (size_t)1)
-                        .set("dilations", dilations)
-                        .set("algo", (std::string) "auto"));
-
-                TensorShape input{
-                    batchSize, inputChannels, inputHeight, inputWidth};
-                TensorShape filter{
-                    outputChannels, inputChannels, filterHeight, filterWidth};
-                TensorShape output{
-                    batchSize, outputChannels, outputHeight, outputWidth};
-
-                function(test, input, filter, output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief A convolution function test interface for depthwise convolution.
- */
-template <DeviceType DType1, DeviceType DType2>
-void DepthwiseConvolution(const std::string& conv1,
-                          const std::string& conv2,
-                          Function<DType1, DType2> function) {
-  for (size_t batchSize : {1, 32}) {
-    for (size_t inputSize : {7, 14, 54}) {
-      for (size_t filterSize : {3, 4}) {
-        for (size_t inputChannels : {32}) {
-          for (size_t outputChannels : {32, 64}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // NNPACK only supports stride = 1 if batchSize > 1,
-                // and there has some bug when batchSize > 1 and groups != 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
-                    batchSize > 1)
-                  break;
-
-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
-
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> dilations = {1, 1};
-                size_t groups = inputChannels;
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", groups)
-                        .set("dilations", dilations)
-                        .set("algo", (std::string) "auto"));
-
-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{groups,
-                                   outputChannels / groups,
-                                   inputChannels / groups,
-                                   filterSize,
-                                   filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
-
-                function(test, input, filter, output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOp.cpp b/paddle/legacy/function/CosSimOp.cpp
deleted file mode 100644
index d04f4396caa..00000000000
--- a/paddle/legacy/function/CosSimOp.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimOp.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-/**
- * Cosine Similarity for CpuMatrix
- *
- * \param out_mat, output value, size: nSamples * 1.
- * \param in1_mat, input value 1, size: nSamples * dim.
- * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale, default 1.0
- *
- */
-template <>
-void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                    const CpuMatrix& in1_mat,
-                                    const CpuMatrix& in2_mat,
-                                    real scale) {
-  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
-  size_t num_samples = out_mat.getHeight();
-  size_t dim = in1_mat.getWidth();
-  /// column vector [nSamples, 1]
-  real* out = out_mat.getData();
-  const real* x = in1_mat.getData();
-  const real* y = in2_mat.getData();
-
-  /// in2 might only have one row or full rows
-  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
-  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += x[j] * x[j];
-      square_sum_y += y[j] * y[j];
-      xy += x[j] * y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-  }
-}
-
-/**
- * Cosine Similarity
- * for each row i,
- *   out[i] = scale * cos(input1[i], input2[i])
- *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
- * when input2 only has one row, then for each row i,
- *   out[i] = cos(input1[i], input2[0])
- *
- * \param inputs[0] input matrix 1, size: nSamples * dim.
- * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param outputs[0] output matrix, size : nSamples * 1.
- */
-
-template <DeviceType Device>
-class CosSimForwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 2UL);
-    CHECK_EQ(outputs.size(), 1UL);
-
-    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], 1UL);
-
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    const auto in1_mat = inputs[0].matrix<Device>();
-    const auto in2_mat = inputs[1].matrix<Device>();
-
-    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
-  }
-
- private:
-  real scale_;
-};
-
-/**
- * Cosine Similarity Derivative for CpuMatrix
- *
- * \param in1_grad  forward input grad 1, size: nSamples * dim.
- * \param in2_grad  forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param out_grad  backward loss output grad, size : nSamples * 1.
- * \param out_val   forward output value, size: nSamples * 1.
- * \param in1_val   forward input value 1, size: nSamples * dim.
- * \param in2_val   forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale,    default 1.0
- */
-template <>
-void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
-                                     const CpuMatrix& out_val,
-                                     const CpuMatrix& in1_val,
-                                     const CpuMatrix& in2_val,
-                                     CpuMatrix& in1_grad,
-                                     CpuMatrix& in2_grad,
-                                     real scale) {
-  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
-        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
-
-  const real* grad = out_grad.getData();
-  const real* out = out_val.getData();
-  const real* prev_out_x = in1_val.getData();
-  const real* prev_out_y = in2_val.getData();
-  real* prev_grad_x = in1_grad.getData();
-  real* prev_grad_y = in2_grad.getData();
-
-  size_t num_samples = out_grad.getHeight();
-  size_t dim = in1_val.getWidth();
-  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
-  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
-  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i,
-              prev_out_x += dim,
-              prev_out_y += inc,
-              prev_grad_x += dim,
-              prev_grad_y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += prev_out_x[j] * prev_out_x[j];
-      square_sum_y += prev_out_y[j] * prev_out_y[j];
-      xy += prev_out_x[j] * prev_out_y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    if (xy == 0) {
-      real reciprocal =
-          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
-        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
-      }
-    } else {
-      real reciprocal_xy = 1.0f / xy;
-      real reciprocal_square_sum_x = 1.0f / square_sum_x;
-      real reciprocal_square_sum_y = 1.0f / square_sum_y;
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] +=
-            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
-                                prev_out_x[j] * reciprocal_square_sum_x);
-        prev_grad_y[j] +=
-            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
-                                prev_out_y[j] * reciprocal_square_sum_y);
-      }
-    }
-  }
-}
-
-/**
- * Cosine Similarity backward Derivative
- *
- * \param outputs[0] forward input grad 1, size: nSamples * dim.
- * \param outputs[1] forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param inputs[0] backward loss output grad, size : nSamples * 1.
- * \param inputs[1] forward output value, size: nSamples * 1.
- * \param inputs[2] forward input value 1, size: nSamples * dim.
- * \param inputs[3] forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- */
-template <DeviceType Device>
-class CosSimBackwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 4UL);
-    CHECK_EQ(outputs.size(), 2UL);
-    /// dim of out_grad and out_val == 1, column vector
-    CHECK_EQ(inputs[0].shape()[1], 1UL);
-    CHECK_EQ(inputs[1].shape()[1], 1UL);
-    /// nSamples of out_grad == out_val == in_val1 == in_grad1
-    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
-    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
-    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
-
-    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
-          inputs[3].data() && outputs[0].data() && outputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-
-    const auto out_grad = inputs[0].matrix<Device>();
-    const auto out_val = inputs[1].matrix<Device>();
-    const auto in1_val = inputs[2].matrix<Device>();
-    const auto in2_val = inputs[3].matrix<Device>();
-    auto in1_grad = outputs[0].matrix<Device>();
-    auto in2_grad = outputs[1].matrix<Device>();
-
-    CosSimBackward<Device>(
-        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
-  }
-
- private:
-  real scale_;
-};
-
-REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOp.h b/paddle/legacy/function/CosSimOp.h
deleted file mode 100644
index 2d377eb3bef..00000000000
--- a/paddle/legacy/function/CosSimOp.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Cosine Similarity Forward.
- * for each row i,
- * out[i] = scale * cos(in1[i], in2[i])
- *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
- *                  sqrt(sum_j (in1[i][j]^2) * sum_j (in2[i][j])^2)
- *
- * \param[out]  output            output value.
- * \param[in]   intput1           input value.
- * \param[in]   intput2           input value.
- * \param[in]   scale             default 1.0.
- *
- */
-template <DeviceType Device>
-void CosSimForward(typename Tensor<real, Device>::Matrix& output,
-                   const typename Tensor<real, Device>::Matrix& input1,
-                   const typename Tensor<real, Device>::Matrix& input2,
-                   real scale);
-
-/**
- * \brief   Cosine Similarity BackWard for Derivative.
- *
- * \param[in]       output grad           backward loss output grad.
- * \param[in]       output val            forward-output value.
- * \param[in]       input val1            forward input value 1.
- * \param[in]       input val2            forward input value 2.
- * \param[in/out]   input grad            forward input grad 1.
- * \param[in/out]   input grad            forward input grad 2.
- * \param[in]       scale                 default 1.0.
- *
- */
-template <DeviceType Device>
-void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
-                    const typename Tensor<real, Device>::Matrix& out_value,
-                    const typename Tensor<real, Device>::Matrix& in1_value,
-                    const typename Tensor<real, Device>::Matrix& in2_value,
-                    typename Tensor<real, Device>::Matrix& in1_grad,
-                    typename Tensor<real, Device>::Matrix& in2_grad,
-                    real scale);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOpGpu.cu b/paddle/legacy/function/CosSimOpGpu.cu
deleted file mode 100644
index 9fe50529ac4..00000000000
--- a/paddle/legacy/function/CosSimOpGpu.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimOp.h"
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-
-namespace paddle {
-
-template <int block_size>
-__global__ void KeCosSim(real* output,
-                         const real* input1,
-                         const real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[block_size];
-  __shared__ real yy[block_size];
-  __shared__ real xy[block_size];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += block_size) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = block_size / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hlCossim(real* output,
-              const real* input1,
-              const real* input2,
-              size_t width,
-              size_t input1_height,
-              size_t input2_height,
-              real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int block_size = 256;
-  dim3 threads(block_size, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hlCossim failed");
-}
-
-template <>
-void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
-                                    const GpuMatrix& in1_mat,
-                                    const GpuMatrix& in2_mat,
-                                    real scale) {
-  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
-  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
-      << "Matrix type are not GPU";
-
-  size_t dim = in1_mat.getWidth();
-  real* out = out_mat.getData();
-  const real* x = in1_mat.getData();
-  const real* y = in2_mat.getData();
-  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
-}
-
-template <int block_size>
-__global__ void KeCosSimDerivative(const real* grad,
-                                   const real* output,
-                                   const real* prev_out_x,
-                                   const real* prev_out_y,
-                                   real* prev_grad_x,
-                                   real* prev_grad_y,
-                                   size_t width,
-                                   size_t input1_height,
-                                   size_t input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[block_size];
-  __shared__ real yy[block_size];
-  __shared__ real xy[block_size];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prev_out_x += ty * width;
-  prev_grad_x += ty * width;
-  if (input2_height > 1) {
-    prev_out_y += ty * width;
-    prev_grad_y += ty * width;
-  }
-  for (int index = tid; index < width; index += block_size) {
-    real x = prev_out_x[index];
-    real y = prev_out_y[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = block_size / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
-      if (input2_height > 1) {
-        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(
-            prev_grad_y + index,
-            scale * grad[ty] * prev_out_x[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
-                                   prev_out_x[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prev_grad_y[index] +=
-            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
-                                     prev_out_y[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(
-            prev_grad_y + index,
-            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
-                                     prev_out_y[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-void hlCossimDerivative(const real* grad,
-                        const real* output,
-                        const real* prev_out_x,
-                        const real* prev_out_y,
-                        real* prev_grad_x,
-                        real* prev_grad_y,
-                        size_t width,
-                        size_t input1_height,
-                        size_t input2_height,
-                        real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prev_out_x);
-  CHECK_NOTNULL(prev_out_y);
-  CHECK_NOTNULL(prev_grad_x);
-  CHECK_NOTNULL(prev_grad_y);
-  const int block_size = 256;
-  dim3 threads(block_size, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad,
-      output,
-      prev_out_x,
-      prev_out_y,
-      prev_grad_x,
-      prev_grad_y,
-      width,
-      input1_height,
-      input2_height,
-      scale);
-  CHECK_SYNC("hlCossimDerivate failed");
-}
-
-template <>
-void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                     const GpuMatrix& out_val,
-                                     const GpuMatrix& in1_val,
-                                     const GpuMatrix& in2_val,
-                                     GpuMatrix& in1_grad,
-                                     GpuMatrix& in2_grad,
-                                     real scale) {
-  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
-        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
-        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
-      << "Matrix types are not equally GPU";
-
-  size_t dim = in1_val.getWidth();
-  const real* grad = out_grad.getData();
-  const real* out = out_val.getData();
-  const real* prev_out_x = in1_val.getData();
-  const real* prev_out_y = in2_val.getData();
-  real* prev_grad_x = in1_grad.getData();
-  real* prev_grad_y = in2_grad.getData();
-  hlCossimDerivative(grad,
-                     out,
-                     prev_out_x,
-                     prev_out_y,
-                     prev_grad_x,
-                     prev_grad_y,
-                     dim,
-                     in1_val.getHeight(),
-                     in2_val.getHeight(),
-                     scale);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOpTest.cpp b/paddle/legacy/function/CosSimOpTest.cpp
deleted file mode 100644
index 31bb43e1baa..00000000000
--- a/paddle/legacy/function/CosSimOpTest.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-
-using namespace paddle;  // NOLINT
-
-void testCosSimForward(size_t height_x,
-                       size_t height_y,
-                       size_t width,
-                       real scale) {
-  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
-                  ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-void testCosSimBackward(size_t height_x,
-                        size_t height_y,
-                        size_t width,
-                        real scale) {
-  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
-                  ADD_TO);
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
-                  ADD_TO);
-  // run Function
-  test.run();
-}
-
-TEST(Matrix, cosSim) {
-  for (auto height_x : {10, 100, 1000}) {
-    for (auto height_y : {1, height_x}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimForward(height_x, height_y, width, scale);
-          testCosSimBackward(height_x, height_y, width, scale);
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/CropOp.cpp b/paddle/legacy/function/CropOp.cpp
deleted file mode 100644
index e22678822f0..00000000000
--- a/paddle/legacy/function/CropOp.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropOp.h"
-#include "paddle/legacy/function/TensorShape.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Crop<DEVICE_TYPE_CPU>(real* outputs,
-                           const real* inputs,
-                           const TensorShape inShape,
-                           const TensorShape outShape,
-                           const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = inShape[0];
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < outC; c++) {
-      for (int h = 0; h < outH; h++) {
-        int outoff = ((n * outC + c) * outH + h) * outW;
-        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
-        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                               real* outGrad,
-                               const TensorShape inShape,
-                               const TensorShape outShape,
-                               const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = outShape[0];
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
-        int inoff = ((n * inC + c) * inH + h) * inW;
-        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
-        CpuVector outG = CpuVector(inW, outGrad + outoff);
-        outG += inG;
-      }
-    }
-  }
-}
-
-/**
- * \brief Crop input according to the specify corner and shape.
- *        The input and output is a 4D tensor. In CropFunc, we only
- *        crop the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the cropping corner and shape.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after cropping.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- *
- * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
- * Output(2,2,1,2) = [
- *                    [ [[4,5]],
- *                      [[6,7]] ],
- *                    [ [[8,7]],
- *                      [[3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- */
-template <DeviceType Device>
-class CropFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape inShape = inputs[0].shape();
-    TensorShape outShape = outputs[0].shape();
-
-    Crop<Device>(outputs[0].data<real>(),
-                 inputs[0].data<real>(),
-                 inShape,
-                 outShape,
-                 conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of cropping Function.
- *
- * Argument in this Function:
- * \param crop_    The same meaning as it in CropFunc.
- * \param inputs  The gradient with respect to the output value of CropFunc.
- * \param outputs The gradient with respect to the input value of CropFunc.
- */
-
-template <DeviceType Device>
-class CropGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape outShape = outputs[0].shape();
-    TensorShape inShape = inputs[0].shape();
-
-    CropGrad<Device>(inputs[0].data<real>(),
-                     outputs[0].data<real>(),
-                     inShape,
-                     outShape,
-                     conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOp.h b/paddle/legacy/function/CropOp.h
deleted file mode 100644
index 05d4b163b37..00000000000
--- a/paddle/legacy/function/CropOp.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief  This funtion crops inputs according to the specify start point and
- *shape.
- *
- * \param[out] outputs	save results.
- * \param[in]  inputs	input data.
- * \param[in]  inShape  the shape of input tensor.
- * \param[in]  conf     the cropping config
- */
-template <DeviceType Device>
-void Crop(real* outputs,
-          const real* inputs,
-          const TensorShape inShape,
-          const TensorShape outShape,
-          const FuncConfig& conf);
-
-/**
- * \brief   Cropping operation backward.
- *
- * \param[out] inGrad	gradients of previous layer
- * \param[in]  outGrad  output gradient
- * \param[in]  inShape  the shape of input tensor.
- * \param[in]  conf     the cropping config
- */
-template <DeviceType Device>
-void CropGrad(const real* inGrad,
-              real* outGrad,
-              const TensorShape inShape,
-              const TensorShape outShape,
-              const FuncConfig& conf);
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOpGpu.cu b/paddle/legacy/function/CropOpGpu.cu
deleted file mode 100644
index 56150624337..00000000000
--- a/paddle/legacy/function/CropOpGpu.cu
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeCrop(real* outputs,
-                       const real* inputs,
-                       int inC,
-                       int inH,
-                       int inW,
-                       int cropC,
-                       int cropH,
-                       int cropW,
-                       int outC,
-                       int outH,
-                       int outW,
-                       int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % outW;
-    const int h = (idx / outW) % outH;
-    const int c = (idx / outW / outH) % outC;
-    const int n = idx / outW / outH / outC;
-
-    const int off = ((n * inC + c + cropC) * inH + h + cropH) * inW + cropW + w;
-    outputs[idx] = inputs[off];
-  }
-}
-
-template <>
-void Crop<DEVICE_TYPE_GPU>(real* outputs,
-                           const real* inputs,
-                           const TensorShape inShape,
-                           const TensorShape outShape,
-                           const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cropC = crop_corner[1];
-  int cropH = crop_corner[2];
-  int cropW = crop_corner[3];
-
-  int num = inShape[0];
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  size_t nth = num * outC * outH * outW;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
-                                                     inputs,
-                                                     inC,
-                                                     inH,
-                                                     inW,
-                                                     cropC,
-                                                     cropH,
-                                                     cropW,
-                                                     outC,
-                                                     outH,
-                                                     outW,
-                                                     nth);
-  CHECK_SYNC("Crop");
-}
-
-__global__ void KeCropDiff(const real* inGrad,
-                           real* outGrad,
-                           int inC,
-                           int inH,
-                           int inW,
-                           int cropC,
-                           int cropH,
-                           int cropW,
-                           int outC,
-                           int outH,
-                           int outW,
-                           int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off =
-        ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
-
-    outGrad[off] += inGrad[idx];
-  }
-}
-
-template <>
-void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
-                               real* outGrad,
-                               const TensorShape inShape,
-                               const TensorShape outShape,
-                               const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cropC = crop_corner[1];
-  int cropH = crop_corner[2];
-  int cropW = crop_corner[3];
-
-  int num = outShape[0];
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
-                                                         outGrad,
-                                                         inC,
-                                                         inH,
-                                                         inW,
-                                                         cropC,
-                                                         cropH,
-                                                         cropW,
-                                                         outC,
-                                                         outH,
-                                                         outW,
-                                                         nth);
-  CHECK_SYNC("CropGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOpTest.cpp b/paddle/legacy/function/CropOpTest.cpp
deleted file mode 100644
index 10c83a0321f..00000000000
--- a/paddle/legacy/function/CropOpTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Crop, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {5, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {false, true}) {
-            CpuGpuFuncCompare compare(
-                test_grad ? "CropGrad" : "Crop",
-                FuncConfig()
-                    .set<std::vector<uint32_t>>("crop_corner", {0, 1, 1, 1})
-                    .set<std::vector<uint32_t>>("crop_shape", {0, 2, 3, 3}));
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{numSamples, 2, 3, 3};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT,
-                                         test_grad ? inDims : outDims,
-                                         test_grad ? ADD_TO : ASSIGN_TO),
-                               test_grad ? ADD_TO : ASSIGN_TO);
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOp.cpp b/paddle/legacy/function/CrossMapNormalOp.cpp
deleted file mode 100644
index f28703af00f..00000000000
--- a/paddle/legacy/function/CrossMapNormalOp.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossMapNormalOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t oneImage = height * width;
-  size_t oneSample = channels * oneImage;
-
-  CpuVector outputsV(numSamples * oneSample, outputs);
-  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
-  CpuVector denomsV(numSamples * oneSample, denoms);
-
-  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
-  // x represents inputs
-  // f(x) represents outputs
-  // denoms save the intermediate result for backward
-  denomsV = denomsV.constant(1.0);
-  const int start = -((int)size - 1) / 2;
-  const int end = (int)size + start;
-  for (size_t i = 0; i < numSamples; i++) {
-    real* oneDenom = denoms + i * oneSample;
-    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
-    for (int c = 0; c < (int)channels; c++) {
-      CpuVector denom(oneImage, oneDenom + c * oneImage);
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
-          denom += input.square() * scale;
-        }
-      }
-    }
-  }
-
-  outputsV = inputsV * denomsV.pow(-pow);
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t oneSample = channels * height * width;
-  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
-                                                         size_t offset) {
-    return CpuVector(height * width, data + offset);
-  };
-
-  const int start = -((int)size) / 2;
-  const int end = (int)size + start;
-  const real ratio = -(real)2 * scale * pow;
-  for (size_t i = 0; i < numSamples; i++) {
-    size_t sOffset = i * oneSample;
-    real* oneInputGrad = inputsGrad + sOffset;
-    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
-    real* oneDenom = const_cast<real*>(denoms) + sOffset;
-    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
-    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
-
-    for (int c = 0; c < (int)channels; c++) {
-      size_t cOffset = c * height * width;
-      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
-      CpuVector inputValue = oneImage(oneInputValue, cOffset);
-      CpuVector denom = oneImage(oneDenom, cOffset);
-      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
-
-      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          size_t offset = (c + s) * height * width;
-          CpuVector output = oneImage(oneOutputValue, offset);
-          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
-          CpuVector denom = oneImage(oneDenom, offset);
-
-          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief Normalization with across maps.
- *
- * This Function comes from the paper
- * "ImageNet Classification with Deep Convolutional Neural Networks".
- *
- * The original formula is:
- *
- *                                Input(i, x, y)
- * Output(i, x, y) = ----------------------------------------------
- *                                 -- upper
- *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
- *                                 -- j = lower
- *
- * upper is `min(C, c + N/2)`
- * lower if `max(0, c - N/2)`
- *
- * Function implementation:
- *
- * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
- * And the meaning of each dimension(0-3) is respectively batch size,
- * feature maps, rows and columns.
- *
- * Input and Output in the above formula is for each map(i) of one image, and
- * Input(i, x, y), Output(i, x, y) represents an element in an image.
- *
- * C is the number of feature maps of one image, and N is a hyper-parameters
- * is configured when Function is initialized. The sum in the denominator
- * is the sum of the same position in the neighboring maps.
- *
- * In the implementation of Function, k is equal to 1,
- * so Function has no argument for k.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent Input
- * \param outputs[0] represent Output
- * \param outputs[1] represent The denominator in the formula(except beta)
- *
- * Note:
- * Save output[1] is to simplify the backward calculation.
- * TODO, if only consider the forward calculation, we can optimize to
- * remove the output[1].
- */
-template <DeviceType Device>
-class CrossMapNormalFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 2;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    // ArgType check still on here,
-    // not sure whether it is better to put inside the check.
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormal<Device>(outputs[0].data<real>(),
-                           outputs[1].data<real>(),
-                           inputs[0].data<real>(),
-                           batchSize,
-                           maps,
-                           rows,
-                           columns,
-                           size_,
-                           scale_,
-                           pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == outputs[0].shape());
-    CHECK(inputs[0].shape() == outputs[1].shape());
-  }
-
-  // Only need the shape of the input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)numInputs_, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
-
-    return ops;
-  }
-
- private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-/**
- * \brief Backward calculation for normalization with across maps.
- *
- * Function implementation:
- *
- * The implementation of this Function is derived from the
- * CrossMapNormalFunc implementation.
- *
- * InputGrad = OutputGrad * denoms ^ (-beta)
- *    -- upper
- *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
- *    -- lower
- *
- * The data of inputs/outputs format is the same as the forward interface
- * and is NCHW.
- *
- * The upper and lower is the same as forward. The logic of the sum
- * is also the same as forward.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
- * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
- * \param inputs[2]  represent OutputGrad
- * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
- *                   This is the intermediate result that is
- *                   preserved in the forward calculation.
- * \param outputs[0] represent InputGrad
- */
-template <DeviceType Device>
-class CrossMapNormalGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 4;
-    numOutputs_ = 1;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    if (outputs[0].getArgType() != ADD_TO) {
-      // Currently, some algorithm implementations are ASSIGN_TO mode,
-      // if need to support the ADD_TO calculation, need to clear the output.
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
-                               inputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               inputs[2].data<real>(),
-                               inputs[3].data<real>(),
-                               batchSize,
-                               maps,
-                               rows,
-                               columns,
-                               size_,
-                               scale_,
-                               pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == inputs[1].shape());
-    CHECK(inputs[0].shape() == inputs[2].shape());
-    CHECK(inputs[0].shape() == inputs[3].shape());
-    CHECK(inputs[0].shape() == outputs[0].shape());
-  }
-
-  // Only need the shape of one input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_LT((size_t)1, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
-
-    return ops;
-  }
-
- private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOp.h b/paddle/legacy/function/CrossMapNormalOp.h
deleted file mode 100644
index bb9cdf20216..00000000000
--- a/paddle/legacy/function/CrossMapNormalOp.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Cross map respose normalize forward.
- *          The data structure of image data is NCHW.
- *
- * \param[out]  outputs     output data.
- * \param[in]   denoms      denoms buffer.
- * \param[in]   inputs      input data.
- * \param[in]   numSamples  batch size of input image.
- * \param[in]   channels    number of channel.
- * \param[in]   height      image height.
- * \param[in]   width       image width.
- * \param[in]   size        size.
- * \param[in]   scale       scale.
- * \param[in]   pow         scale.
- *
- */
-template <DeviceType Device>
-void CrossMapNormal(real* outputs,
-                    real* denoms,
-                    const real* inputs,
-                    size_t numSamples,
-                    size_t channels,
-                    size_t height,
-                    size_t width,
-                    size_t size,
-                    real scale,
-                    real pow);
-
-/**
- * \brief   Cross map respose normalize backward.
- *          The data structure of image data is NCHW.
- *
- * \param[out]  inputsGrad      input grad.
- * \param[in]   inputsValue     input value.
- * \param[out]  outputsValue    output value.
- * \param[out]  outputsGrad     output grad.
- * \param[in]   denoms          denoms buffer.
- * \param[in]   numSamples      batch size of input image.
- * \param[in]   channels        number of channel.
- * \param[in]   height          image height.
- * \param[in]   width           image width.
- * \param[in]   size            size.
- * \param[in]   scale           scale.
- * \param[in]   pow             scale.
- *
- */
-template <DeviceType Device>
-void CrossMapNormalGrad(real* inputsGrad,
-                        const real* inputsValue,
-                        const real* outputsValue,
-                        const real* outputsGrad,
-                        const real* denoms,
-                        size_t numSamples,
-                        size_t channels,
-                        size_t height,
-                        size_t width,
-                        size_t size,
-                        real scale,
-                        real pow);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOpGpu.cu b/paddle/legacy/function/CrossMapNormalOpGpu.cu
deleted file mode 100644
index 938827610af..00000000000
--- a/paddle/legacy/function/CrossMapNormalOpGpu.cu
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossMapNormalOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeCMRNormFillScale(size_t imageSize,
-                                   const real* in,
-                                   real* scale,
-                                   size_t channels,
-                                   size_t height,
-                                   size_t width,
-                                   size_t size,
-                                   real alpha) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-
-    in += offset;
-    scale += offset;
-    const int step = height * width;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    real accum = 0;
-    int index = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += in[index * step] * in[index * step];
-      }
-      if (index >= size) {
-        accum -= in[(index - size) * step] * in[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        scale[(index - post_pad) * step] = 1. + accum * alpha;
-      }
-      ++index;
-    }
-  }
-}
-
-__global__ void KeCMRNormOutput(size_t inputSize,
-                                const real* in,
-                                const real* scale,
-                                real negative_beta,
-                                real* out) {
-  const int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < inputSize) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t imageSize = numSamples * height * width;
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      imageSize, inputs, denoms, channels, height, width, size, scale);
-
-  size_t inputSize = numSamples * height * width * channels;
-  blockSize = 1024;
-  gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      inputSize, inputs, denoms, -pow, outputs);
-
-  CHECK_SYNC("CrossMapNormal");
-}
-
-__global__ void KeCMRNormDiff(size_t imageSize,
-                              const real* bottom_data,
-                              const real* top_data,
-                              const real* scale,
-                              const real* top_diff,
-                              size_t channels,
-                              size_t height,
-                              size_t width,
-                              size_t size,
-                              real negative_beta,
-                              real cache_ratio,
-                              real* bottom_diff) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-
-    const int step = height * width;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    int index = 0;
-    real accum = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += top_diff[index * step] * top_data[index * step] /
-                 scale[index * step];
-      }
-      if (index >= size) {
-        accum -= top_diff[(index - size) * step] *
-                 top_data[(index - size) * step] / scale[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        bottom_diff[(index - post_pad) * step] +=
-            top_diff[(index - post_pad) * step] *
-                pow(scale[(index - post_pad) * step], negative_beta) -
-            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
-      }
-      ++index;
-    }
-  }
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t imageSize = numSamples * height * width;
-
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
-                                                            inputsValue,
-                                                            outputsValue,
-                                                            denoms,
-                                                            outputsGrad,
-                                                            channels,
-                                                            height,
-                                                            width,
-                                                            size,
-                                                            -pow,
-                                                            2.0f * pow * scale,
-                                                            inputsGrad);
-  CHECK_SYNC("CrossMapNormalGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOpTest.cpp b/paddle/legacy/function/CrossMapNormalOpTest.cpp
deleted file mode 100644
index dec52adde22..00000000000
--- a/paddle/legacy/function/CrossMapNormalOpTest.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(CrossMapNormal, real) {
-  for (size_t numSamples : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (size_t size : {1, 3}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " size=" << size;
-
-            // init Test object
-            CpuGpuFuncCompare test("CrossMapNormal",
-                                   FuncConfig()
-                                       .set("size", size)
-                                       .set("scale", (real)1.5)
-                                       .set("pow", (real)0.5));
-            // prepare input arguments
-            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            // run Function
-            test.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(CrossMapNormalGrad, real) {
-  for (size_t numSamples : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (size_t size : {1, 3}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " size=" << size;
-
-            CpuGpuFuncCompare test("CrossMapNormalGrad",
-                                   FuncConfig()
-                                       .set("size", size)
-                                       .set("scale", (real)1.5)
-                                       .set("pow", (real)0.5));
-            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            // run Function
-            test.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOp.cpp b/paddle/legacy/function/DepthwiseConvOp.cpp
deleted file mode 100644
index 958034e08e6..00000000000
--- a/paddle/legacy/function/DepthwiseConvOp.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "ConvOp.h"
-
-namespace paddle {
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    // TODO(zhaolong) : cpu implementation of depthwise convolution
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-/*
- * \brief Forward calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-
-    DepthwiseConvFunctor<Device, real> depthwiseConv;
-    depthwiseConv(inputData,
-                  filterData,
-                  batchSize,
-                  outputChannels,
-                  outputHeight,
-                  outputWidth,
-                  inputChannels,
-                  inputHeight,
-                  inputWidth,
-                  filterMultiplier,
-                  filterHeight,
-                  filterWidth,
-                  strideH(),
-                  strideW(),
-                  paddingH(),
-                  paddingW(),
-                  outputData);
-  }
-};
-
-/*
- * \brief Backward input calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradInputFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-
-    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
-    depthwiseConvGradInput(outputGrad,
-                           filterData,
-                           batchSize,
-                           outputChannels,
-                           outputHeight,
-                           outputWidth,
-                           inputChannels,
-                           inputHeight,
-                           inputWidth,
-                           filterMultiplier,
-                           filterHeight,
-                           filterWidth,
-                           strideH(),
-                           strideW(),
-                           paddingH(),
-                           paddingW(),
-                           inputGrad);
-  }
-};
-
-/*
- * \brief Backward filter calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-
-    int size = outputChannels * filterHeight * filterWidth * outputHeight *
-               outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
-
-    depthwiseConvGradFilter(outputGrad,
-                            inputData,
-                            batchSize,
-                            outputChannels,
-                            outputHeight,
-                            outputWidth,
-                            inputChannels,
-                            inputHeight,
-                            inputWidth,
-                            filterMultiplier,
-                            filterHeight,
-                            filterWidth,
-                            strideH(),
-                            strideW(),
-                            paddingH(),
-                            paddingW(),
-                            colData,
-                            filterGrad);
-  }
-};
-
-REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    CPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    CPU,
-                    DepthwiseConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    GPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    GPU,
-                    DepthwiseConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOp.h b/paddle/legacy/function/DepthwiseConvOp.h
deleted file mode 100644
index 7837edd1c07..00000000000
--- a/paddle/legacy/function/DepthwiseConvOp.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorType.h"
-
-namespace paddle {
-
-/**
- *\brief   Depthwise convolution forward. The outputData
- *         of depthwise convolution is same with ExpandConvLayer
- *         when groups equals inputChannels in ExpandConvLayer.
- *
- * \param[in]   inputData         input data.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of inputData.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData..
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  outputData        outputData.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvFunctor {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
- *
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  inputGrad         the grad data of input.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradInputFunctor {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   inputData         inputData.
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[in]   colData           Auxiliary data when calculating filterGrad.
- * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
- * \param[out]  filterGrad        the grad data of filter.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradFilterFunctor {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOpGpu.cu b/paddle/legacy/function/DepthwiseConvOpGpu.cu
deleted file mode 100644
index 17138cc5639..00000000000
--- a/paddle/legacy/function/DepthwiseConvOpGpu.cu
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-
-namespace paddle {
-
-// CUDA kernel to compute the depthwise convolution forward pass
-template <class T>
-__global__ void ConvolutionDepthwiseForward(const int nthreads,
-                                            const T* const inputData,
-                                            const T* const filterData,
-                                            const int batchSize,
-                                            const int outputChannels,
-                                            const int outputHeight,
-                                            const int outputWidth,
-                                            const int inputChannels,
-                                            const int inputHeight,
-                                            const int inputWidth,
-                                            const int filterMultiplier,
-                                            const int filterHeight,
-                                            const int filterWidth,
-                                            const int strideH,
-                                            const int strideW,
-                                            const int paddingH,
-                                            const int paddingW,
-                                            T* const outputData) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-
-  if (index < nthreads) {
-    const int batch = index / outputChannels / outputHeight / outputWidth;
-    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-
-    const int c_in = c_out / filterMultiplier;
-    const T* weight = filterData + c_out * filterHeight * filterWidth;
-    T value = 0;
-    const int h_in_start = -paddingH + h_out * strideH;
-    const int w_in_start = -paddingW + w_out * strideW;
-    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
-    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
-    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
-        (w_in_end < inputWidth)) {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          const int offset =
-              ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                  inputWidth +
-              w_in;
-          value += (*weight) * inputData[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-              (w_in < inputWidth)) {
-            const int offset =
-                ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                    inputWidth +
-                w_in;
-            value += (*weight) * inputData[offset];
-          }
-          ++weight;
-        }
-      }
-    }
-    outputData[index] = value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
-template <class T>
-__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
-                                                  const T* const top_diff,
-                                                  const T* const weight_data,
-                                                  const int num,
-                                                  const int outputChannels,
-                                                  const int outputHeight,
-                                                  const int outputWidth,
-                                                  const int inputChannels,
-                                                  const int inputHeight,
-                                                  const int inputWidth,
-                                                  const int filterMultiplier,
-                                                  const int filterHeight,
-                                                  const int filterWidth,
-                                                  const int strideH,
-                                                  const int strideW,
-                                                  const int paddingH,
-                                                  const int paddingW,
-                                                  T* const bottom_diff) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / inputChannels / inputHeight / inputWidth;
-    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
-    const int h_in = (index / inputWidth) % inputHeight;
-    const int w_in = index % inputWidth;
-
-    const int c_out_start = c_in * filterMultiplier;
-
-    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
-    h_out_start = 0 > h_out_start ? 0 : h_out_start;
-    int h_out_end = (h_in + paddingH) / strideH;
-    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
-    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
-    w_out_start = 0 > w_out_start ? 0 : w_out_start;
-    int w_out_end = (w_in + paddingW) / strideW;
-    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
-
-    T value = 0;
-
-    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
-         c_out++) {
-      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
-        const int filter_h = h_in + paddingH - h_out * strideH;
-        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
-          const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth +
-                                    filter_h * filterWidth + filter_w;
-          const int top_diff_offset =
-              ((batch * outputChannels + c_out) * outputHeight + h_out) *
-                  outputWidth +
-              w_out;
-          value += top_diff[top_diff_offset] * weight_data[filter_offset];
-        }
-      }
-    }
-    bottom_diff[index] += value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
-template <class T>
-__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
-                                                   const int nthreads,
-                                                   const T* const top_diff,
-                                                   const T* const inputData,
-                                                   const int num,
-                                                   const int outputChannels,
-                                                   const int outputHeight,
-                                                   const int outputWidth,
-                                                   const int inputChannels,
-                                                   const int inputHeight,
-                                                   const int inputWidth,
-                                                   const int filterMultiplier,
-                                                   const int filterHeight,
-                                                   const int filterWidth,
-                                                   const int strideH,
-                                                   const int strideW,
-                                                   const int paddingH,
-                                                   const int paddingW,
-                                                   T* const buffer_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-    const int kh =
-        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
-    const int kw = (index / outputHeight / outputWidth) % filterWidth;
-    const int h_in = -paddingH + h_out * strideH + kh;
-    const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-        (w_in < inputWidth)) {
-      const int c_out =
-          index / (filterHeight * filterWidth * outputHeight * outputWidth);
-      const int c_in = c_out / filterMultiplier;
-      const int batch = num_i;
-      const int top_offset =
-          ((batch * outputChannels + c_out) * outputHeight + h_out) *
-              outputWidth +
-          w_out;
-      const int bottom_offset =
-          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
-          w_in;
-      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
-    } else {
-      buffer_data[index] = 0;
-    }
-  }
-}
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
-
-    size_t blocks = (outputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        outputSize,
-        inputData,
-        filterData,
-        batchSize,
-        outputChannels,
-        outputHeight,
-        outputWidth,
-        inputChannels,
-        inputHeight,
-        inputWidth,
-        filterMultiplier,
-        filterHeight,
-        filterWidth,
-        strideH,
-        strideW,
-        paddingH,
-        paddingW,
-        outputData);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {
-    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-
-    size_t blocks = (inputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseInputBackward<T>
-        // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
-                                               outputGrad,
-                                               filterData,
-                                               batchSize,
-                                               outputChannels,
-                                               outputHeight,
-                                               outputWidth,
-                                               inputChannels,
-                                               inputHeight,
-                                               inputWidth,
-                                               filterMultiplier,
-                                               filterHeight,
-                                               filterWidth,
-                                               strideH,
-                                               strideW,
-                                               paddingH,
-                                               paddingW,
-                                               inputGrad);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {
-    int colDataSize = outputChannels * filterHeight * filterWidth *
-                      outputHeight * outputWidth;
-
-    size_t blocks = (colDataSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1,
-                                filterGrad,
-                                false,
-                                true);
-
-    for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<
-          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
-                                                   colDataSize,
-                                                   outputGrad,
-                                                   inputData,
-                                                   batchSize,
-                                                   outputChannels,
-                                                   outputHeight,
-                                                   outputWidth,
-                                                   inputChannels,
-                                                   inputHeight,
-                                                   inputWidth,
-                                                   filterMultiplier,
-                                                   filterHeight,
-                                                   filterWidth,
-                                                   strideH,
-                                                   strideW,
-                                                   paddingH,
-                                                   paddingW,
-                                                   colData);
-      int K = outputHeight * outputWidth;
-      int M = colDataSize / K;
-
-      BaseMatrix colMatrix(M, K, colData, false, true);
-      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
-    }
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
-#else
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOpTest.cpp b/paddle/legacy/function/DepthwiseConvOpTest.cpp
deleted file mode 100644
index caf8f3597ff..00000000000
--- a/paddle/legacy/function/DepthwiseConvOpTest.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ConvOpTest.h"
-
-namespace paddle {
-
-#ifdef PADDLE_WITH_CUDA
-TEST(DepthwiseConv, Forward) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "DepthwiseConv-GPU", forward);
-}
-
-TEST(DepthwiseConv, BackwardInput) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input);
-}
-
-TEST(DepthwiseConv, BackwardFilter) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter);
-}
-#endif
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-TEST(DepthwiseConv, Forward) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
-}
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/EigenGemm.cpp b/paddle/legacy/function/EigenGemm.cpp
deleted file mode 100644
index 5929c5c68ec..00000000000
--- a/paddle/legacy/function/EigenGemm.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/legacy/function/EigenThreadDevice.h"
-
-namespace paddle {
-
-template <class T>
-struct EigenBlasGemm {
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
-                           Eigen::Aligned>
-      EigenMatrix;
-
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    Eigen::array<int, 2> sizeA;
-    if (transA) {
-      sizeA[0] = K;
-      sizeA[1] = M;
-      CHECK_EQ(M, lda);
-    } else {
-      sizeA[0] = M;
-      sizeA[1] = K;
-      CHECK_EQ(K, lda);
-    }
-    Eigen::array<int, 2> sizeB;
-    if (transB) {
-      sizeB[0] = N;
-      sizeB[1] = K;
-      CHECK_EQ(K, ldb);
-    } else {
-      sizeB[0] = K;
-      sizeB[1] = N;
-      CHECK_EQ(N, ldb);
-    }
-    Eigen::array<int, 2> sizeC = {{M, ldc}};
-    Eigen::array<int, 2> offsetC = {{0, 0}};
-    Eigen::array<int, 2> extentC = {{M, N}};
-
-    const EigenMatrix a(const_cast<T*>(A), sizeA);
-    const EigenMatrix b(const_cast<T*>(B), sizeB);
-    EigenMatrix c(C, sizeC);
-
-    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
-    Eigen::array<DimPair, 1> dims;
-    dims[0] = DimPair(1, 0);
-    dims[0].first = transA ? 0 : 1;
-    dims[0].second = transB ? 1 : 0;
-
-    auto* device = EigenDeviceWarpper::device();
-    if (N == ldc) {
-      if (alpha == T(1) && beta == T(0)) {
-        c.device(*device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.device(*device) += a.contract(b, dims);
-      } else {
-        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
-      }
-    } else {
-      if (alpha == T(1) && beta == T(0)) {
-        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
-      } else {
-        c.slice(offsetC, extentC).device(*device) =
-            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
-      }
-    }
-    EigenDeviceWarpper::free_device(device);
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template struct EigenBlasGemm<double>;
-#else
-template struct EigenBlasGemm<float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/EigenThreadDevice.h b/paddle/legacy/function/EigenThreadDevice.h
deleted file mode 100644
index eb92251c827..00000000000
--- a/paddle/legacy/function/EigenThreadDevice.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#pragma once
-
-#if defined(__OSX__) || defined(__APPLE__)
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-
-#if defined(__ANDROID__)
-int GetCpuCount() {
-  FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
-  if (!fp) {
-    return 1;
-  }
-  int rank0, rank1;
-  int num = fscanf(fp, "%d-%d", &rank0, &rank1);
-  fclose(fp);
-  if (num < 2) return 1;
-  return rank1 + 1;
-}
-#elif defined(__OSX__) || defined(__APPLE__)
-int GetCpuCount() {
-  int count = 0;
-  size_t len = sizeof(int);
-  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
-  return count > 0 ? count : 1;
-}
-#else
-int GetCpuCount() { return 1; }
-#endif
-
-class EigenDeviceWarpper {
- public:  // NOLINT
-#if EIGEN_USE_THREADS
-  static Eigen::ThreadPoolDevice* device() {
-    const int num_cpus = GetCpuCount();
-    const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
-    static Eigen::ThreadPool tp(num_threads);
-    static Eigen::ThreadPoolDevice* device =
-        new Eigen::ThreadPoolDevice(&tp, num_threads);
-    return device;
-  }
-
-  static void free_device(Eigen::ThreadPoolDevice* device) {
-    // do nothing
-  }
-#else
-  static Eigen::DefaultDevice* device() {
-    Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
-    return device;
-  }
-
-  static void free_device(Eigen::DefaultDevice* device) { delete device; }
-#endif
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Function.cpp b/paddle/legacy/function/Function.cpp
deleted file mode 100644
index 344358fd3d3..00000000000
--- a/paddle/legacy/function/Function.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-
-namespace paddle {
-
-void BufferArgs::addArg(const Matrix& arg,
-                        const TensorShape& shape,
-                        ArgType argType) {
-  _args_.push_back(new BufferArg(arg, shape, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  _args_.push_back(new SparseMatrixArg(arg, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  _args_.push_back(new SparseMatrixArg(arg, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const Matrix& matrix,
-                        const IVector& vector,
-                        ArgType argType) {
-  _args_.push_back(new SequenceArg(matrix, vector, argType));
-  addArg(*_args_.back());
-}
-
-ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Function.h b/paddle/legacy/function/Function.h
deleted file mode 100644
index bc5ef7e6f20..00000000000
--- a/paddle/legacy/function/Function.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "BufferArg.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Any.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-/**
- * Function Configuration.
- * The argument type of Function::init.
- */
-class FuncConfig {
- public:
-  template <typename T>
-  T get(const std::string& key, Error* err = nullptr) const {
-    try {
-      return any_cast<T>(valueMap_.at(key));
-    } catch (std::exception& e) {  // could be cast or out of range exception.
-      if (err) {
-        *err = Error(e.what());
-      } else {
-        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
-      }
-      return T();
-    }
-  }
-
-  template <typename T>
-  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
-    auto it = valueMap_.find(key);
-    if (it != valueMap_.end()) {  // already contains key.
-      if (err) {
-        *err = Error("Key %s is already set in FuncConfig", key.c_str());
-      } else {
-        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
-      }
-      return *this;
-    }
-    valueMap_[key] = any(v);
-    return *this;
-  }
-
- protected:
-  mutable std::unordered_map<std::string, any> valueMap_;
-};
-
-/**
- * Argument type for Function::calc().
- * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs and outputs.
- *
- * addArg() with Matix object used to adapt Layer Argument.
- * Will create a BufferArg object in addArg(),
- * and free in destructor of BufferArgs.
- *
- * addArg() with BufferArg object, just save BufferArg object address,
- * and the caller needs to guarantee the validity of the BufferArg object
- * in the BufferArgs life time.
- */
-class BufferArgs {
- public:
-  BufferArgs() {}
-
-  ~BufferArgs() {
-    for (auto arg : _args_) {
-      delete arg;
-    }
-  }
-
-  size_t size() const { return args_.size(); }
-
-  // add argument into BufferArgs
-  // Tensor can be Matrix, Vector, IVector.
-  // For inputs, do not need argType.
-  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  // Add arg into BufferArgs and reshape the arg.
-  //
-  // For example, arg represents an image buffer,
-  // but Matrix can only represent a two-dimensional Tensor.
-  // So need an extra argument to describe the shape of the image buffer.
-  void addArg(const Matrix& arg,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED);
-
-  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-
-  void addArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED);
-
-  // get argument
-  const BufferArg& operator[](size_t num) const {
-    CHECK_LT(num, args_.size());
-    return *args_[num];
-  }
-
-  void addArg(BufferArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
-
- private:
-  std::vector<BufferArg*> args_;
-  // The BufferArg object is constructed and freed by BufferArgs.
-  std::vector<BufferArg*> _args_;
-};
-
-/**
- * \brief Base class for Function.
- * The basic Function implementation requires override init and calc interfaces.
- *
- * The caller needs to ensure the validity of the arguments
- * during Function execution.
- *
- * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
- * and ADD_TO.
- * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
- * result of Function assigned to the output BufferArg.
- * If output.getArgType() == ADD_TO, this is add mode, and the calculation
- * result of Function need added to the output BufferArg.
- *
- * For example:
- * ASSIGN_TO: output = Function(inputs)
- * ADD_TO: output += Function(inputs)
- * If Function has more than one output, each output can have different modes.
- */
-class FunctionBase {
- public:
-  virtual ~FunctionBase() {}
-
-  virtual void init(const FuncConfig& config) {}
-
-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // This member function is used to check whether the BufferType and shape of
-  // the inputs and outputs arguments of the Function are correct.
-  // General calc function which will call this check to do arguments check.
-  // And before the calc called, the caller can also check their own arguments.
-  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // Calculate the number of floating-point operations of this Function.
-  // The inputs and outputs arguments do not need to contain the actual data,
-  // only the shape.
-  // And some Functions have the same input and output shapes,
-  // so you may not need to enter the complete number of arguments.
-  // But entering the full arguments is always correct for this interface.
-  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
-    return 0;
-  }
-
-  int getNumInputs() const { return numInputs_; }
-
-  int getNumOutputs() const { return numOutputs_; }
-
-  static ClassRegistrar<FunctionBase> funcRegistrar_;
-
- protected:
-  // numInputs_ and numOutputs_ represents the maximum
-  // input and output supported by Function.
-  // Some functions are optimized for input and output,
-  // so when comparing the number of arguments, for these functions
-  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
-  size_t numInputs_;
-  size_t numOutputs_;
-};
-
-#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
-
-#define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
-  static InitFunction __reg_type_##typeName##deviceName([]() { \
-    FunctionBase::funcRegistrar_                               \
-        .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
-            FUNC_NAME(typeName, deviceName));                  \
-  })
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.cpp b/paddle/legacy/function/FunctionTest.cpp
deleted file mode 100644
index 1a0993e3135..00000000000
--- a/paddle/legacy/function/FunctionTest.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-template <DeviceType DType>
-void FunctionApi(typename Tensor<real, DType>::Matrix& output,
-                 const typename Tensor<real, DType>::Matrix& input);
-
-template <>
-void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100U);
-  EXPECT_EQ(output.getWidth(), 200U);
-}
-
-template <>
-void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10U);
-  EXPECT_EQ(output.getWidth(), 20U);
-}
-
-template <DeviceType DType>
-void Function(const BufferArgs& arguments) {
-  const auto input = arguments[0].matrix<DType>();
-  auto output = arguments[1].matrix<DType>();
-  FunctionApi<DType>(output, input);
-}
-
-TEST(Function, BufferArgs) {
-  CpuMatrix cpuInput = CpuMatrix(100, 200);
-  CpuMatrix cpuOutput = CpuMatrix(100, 200);
-  BufferArgs cpuArgments;
-  cpuArgments.addArg(cpuInput);
-  cpuArgments.addArg(cpuOutput);
-  Function<DEVICE_TYPE_CPU>(cpuArgments);
-
-  GpuMatrix gpuInput = GpuMatrix(10, 20);
-  GpuMatrix gpuOutput = GpuMatrix(10, 20);
-  BufferArgs gpuArgments;
-  gpuArgments.addArg(gpuInput);
-  gpuArgments.addArg(gpuOutput);
-  Function<DEVICE_TYPE_GPU>(gpuArgments);
-}
-
-/**
- * Some tests case are used to check the consistency between the BufferArg type
- * argument received by Function and the original type argument.
- *
- * Use Case:
- *  TEST() {
- *    Matrix matrix(...);
- *    CheckBufferArg lambda = [=](const BufferArg& arg) {
- *      // check matrix and arg are equivalent
- *      EXPECT_EQ(matrix, arg);
- *    }
- *
- *   BufferArgs argments{matrix...};
- *   std::vector<CheckBufferArg> checkFunc{lambda...};
- *   testBufferArgs(argments, checkFunc);
- *  }
- */
-typedef std::function<void(const BufferArg&)> CheckBufferArg;
-
-void testBufferArgs(const BufferArgs& inputs,
-                    const std::vector<CheckBufferArg>& check) {
-  EXPECT_EQ(inputs.size(), check.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    check[i](inputs[i]);
-  }
-}
-
-void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1U);
-  check(inputs[0]);
-}
-
-TEST(Arguments, Matrix) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.shape()[1], 200U);
-    EXPECT_EQ(arg.data(), matrix->getData());
-
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, Vector) {
-  VectorPtr vector = Vector::create(100, false);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.data(), vector->getData());
-
-    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*vector);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, CpuSparseMatrix) {
-  CpuSparseMatrix sparse(200, 300, 50);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 200U);
-    EXPECT_EQ(arg.shape()[1], 300U);
-    EXPECT_EQ(arg.data(), sparse.getData());
-    // CHECK_EQ(arg.sparse().nnz(), 50);
-    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
-  };
-
-  BufferArgs argments;
-  argments.addArg(sparse);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, BufferArg) {
-  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3U);
-    EXPECT_EQ(arg.shape()[0], 1U);
-    EXPECT_EQ(arg.shape()[1], 2U);
-    EXPECT_EQ(arg.shape()[2], 3U);
-  };
-
-  BufferArgs argments;
-  argments.addArg(arg);
-  testBufferArgs(argments, check);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.h b/paddle/legacy/function/FunctionTest.h
deleted file mode 100644
index 6f01981a34b..00000000000
--- a/paddle/legacy/function/FunctionTest.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
-
-namespace test {
-template <DeviceType DType>
-struct Allocator;
-
-template <>
-struct Allocator<DEVICE_TYPE_CPU> {
-  using type = CpuMemoryHandle;
-};
-
-template <>
-struct Allocator<DEVICE_TYPE_GPU> {
-  using type = GpuMemoryHandle;
-};
-
-// Copy argument1 to argument2
-template <DeviceType DType1, DeviceType DType2>
-class CopyArgument {
- public:
-  void operator()(const BufferArg& arg1, BufferArg& arg2) {
-    CHECK_EQ(arg1.valueType(), arg2.valueType());
-    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
-
-    if (arg1.valueType() == VALUE_TYPE_INT32) {
-      IVectorPtr vector1 =
-          IVector::create((int*)arg1.data(),
-                          arg1.shape().getElements(),
-                          DType1 == DEVICE_TYPE_CPU ? false : true);
-      IVectorPtr vector2 =
-          IVector::create((int*)arg2.data(),
-                          arg2.shape().getElements(),
-                          DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    } else {
-      VectorPtr vector1 =
-          Vector::create((real*)arg1.data(),
-                         arg1.shape().getElements(),
-                         DType1 == DEVICE_TYPE_CPU ? false : true);
-      VectorPtr vector2 =
-          Vector::create((real*)arg2.data(),
-                         arg2.shape().getElements(),
-                         DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    }
-  }
-};
-}  // namespace test
-
-/**
- * \brief A class for comparing two Functions of different implementations.
- *        For example, can be used to compare the CPU and GPU implementation
- *        of the function is consistent.
- *
- * Use case:
- *  // Initializes a test object, the corresponding cpu and gpu Function
- *  // are constructed according to FunctionName and FuncConfig.
- *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
- *  // Prepare inputs and outputs arguments.
- *  // Here the input and output can not contain real data,
- *  // only contains the argument type and shape.
- *  test.addInputs(input1);
- *  test.addInputs(input2);
- *  test.addOutputs(output1);
- *  test.addOutputs(output2);
- *  // Run.
- *  // Will according to the type and shape of arguments(inputs_/outputs_),
- *  // automatic initialization cpu and gpu function required arguments
- *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
- *  // Call the CPU and GPU Function calculation results.
- *  // Compares CPU and GPU calculation results for consistency.
- *  test.run();
- */
-template <DeviceType DType1, DeviceType DType2>
-class Compare2Function {
- public:
-  typedef typename test::Allocator<DType1>::type Allocator1;
-  typedef typename test::Allocator<DType2>::type Allocator2;
-  typedef typename Tensor<real, DType1>::Vector Vector1;
-  typedef typename Tensor<real, DType2>::Vector Vector2;
-  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
-  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
-
-  Compare2Function(const std::string& name1,
-                   const std::string& name2,
-                   const FuncConfig& config)
-      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
-        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
-    function1_->init(config);
-    function2_->init(config);
-    initArgsCallback_ = nullptr;
-  }
-
-  ~Compare2Function() {}
-
-  // input need only contains shape, do not contains data.
-  void addInputs(const BufferArg& input) {
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
-    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
-  }
-
-  // assume one copy of sequence is shared by different SequenceArgs
-  void addSequence(const SequenceIdArg& input) {
-    CHECK_EQ(input.shape().ndims(), 1UL);
-    size_t batchSize = input.shape()[0];
-    size_t numSeqs = batchSize / 10 + 1;
-    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
-    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    /// init sequence Id
-    initArg(*seq1_, batchSize);
-
-    copyArg_(*seq1_, *seq2_);
-  }
-
-  void addInputs(const SequenceArg& input) {
-    CHECK_EQ(input.shape().ndims(), 2UL);
-    size_t batchSize = input.shape()[0];
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq1_));
-    func2Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq2_));
-  }
-
-  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
-    initArgsCallback_ = callback;
-  }
-
-  // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-  }
-
-  /// add and init output sparse matrix
-  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
-  }
-
-  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
-    CHECK_EQ(output.shape().ndims(), 2UL);
-    size_t batchSize = output.shape()[0];
-
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq1_,
-                                      argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq2_,
-                                      argType));
-  }
-
-  void addInputs(const SparseMatrixArg& input) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
-    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
-  }
-
-  void run() {
-    // prepare cpu/gpu arguments
-    initInputs();
-
-    initOutputs();
-    // function calculate
-    auto callFunction = [](FunctionBase* function,
-                           std::vector<BufferArgPtr>& inputs,
-                           std::vector<BufferArgPtr>& outputs) {
-      BufferArgs inArgs;
-      BufferArgs outArgs;
-      for (auto arg : inputs) {
-        inArgs.addArg(*arg);
-      }
-      for (auto arg : outputs) {
-        outArgs.addArg(*arg);
-      }
-      function->calc(inArgs, outArgs);
-    };
-
-    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
-    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
-
-    // check outputs
-    compareOutputs();
-  }
-
-  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
-
-  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
-
- protected:
-  // only init cpu argument, gpu argument copy from cpu argument.
-  void initArg(BufferArg& arg) {
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceArg& arg) {
-    /// init only matrix
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceIdArg& arg, size_t batchSize) {
-    size_t numSeqs = arg.numSeqs();
-    int* buf = reinterpret_cast<int*>(arg.data());
-    int pos = 0;
-    size_t maxLen = 2 * batchSize / numSeqs;
-    for (int i = 0; i < (int)numSeqs; ++i) {
-      int len = 1 + uniformRandom(std::min<int64_t>(
-                        maxLen, batchSize - pos - numSeqs + i));
-      buf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = batchSize;
-  }
-
-  void initInputs() {
-    for (size_t i = 0; i < func1Inputs_.size(); i++) {
-      if (func1Inputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Inputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
-      } else {
-        initArg(*func1Inputs_[i]);
-      }
-
-      if (initArgsCallback_ != nullptr) {
-        initArgsCallback_(*func1Inputs_[i], i);
-      }
-
-      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
-    }
-  }
-
-  void initOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      if (func1Outputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Outputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
-      } else {
-        initArg(*func1Outputs_[i]);
-      }
-
-      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
-    }
-  }
-
-  void compareOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = func1Outputs_[i];
-      const auto gpu = func2Outputs_[i];
-      CHECK_EQ(cpu->numElements(), gpu->numElements());
-      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
-      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
-      autotest::TensorCheckErr(cpuVector, gpuVector);
-    }
-  }
-
- protected:
-  std::shared_ptr<FunctionBase> function1_;
-  std::shared_ptr<FunctionBase> function2_;
-  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
-  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
-  std::vector<BufferArgPtr> func1Inputs_;
-  std::vector<BufferArgPtr> func1Outputs_;
-  std::vector<BufferArgPtr> func2Inputs_;
-  std::vector<BufferArgPtr> func2Outputs_;
-  std::shared_ptr<SparseMatrix1> sparse1_;
-  std::shared_ptr<SparseMatrix2> sparse2_;
-  std::shared_ptr<SequenceIdArg> seq1_;
-  std::shared_ptr<SequenceIdArg> seq2_;
-  test::CopyArgument<DType1, DType2> copyArg_;
-  std::function<void(BufferArg&, size_t)> initArgsCallback_;
-};
-
-class CpuGpuFuncCompare
-    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
- public:
-  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
-      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
-
-  ~CpuGpuFuncCompare() {}
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmConvOp.cpp b/paddle/legacy/function/GemmConvOp.cpp
deleted file mode 100644
index 5a81315661d..00000000000
--- a/paddle/legacy/function/GemmConvOp.cpp
+++ /dev/null
@@ -1,522 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-#include "GemmFunctor.h"
-#include "Im2Col.h"
-#include "paddle/legacy/math/MemoryHandle.h"
-
-namespace paddle {
-
-/*
- * \brief Forward calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int K = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        K,
-                                        colData,
-                                        N,
-                                        beta,
-                                        outputData + g * outputOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-
-/*
- * \brief Forward calculation of convolution, optimized for mobile.
- */
-template <DeviceType Device>
-class GemmConvMobileFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    real* colData = NULL;
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape;
-
-    // Max col matrix width 4096, Max col matrix size 4M.
-    size_t outputHeightSteps =
-        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
-    size_t maxColWidth = outputHeightSteps * outputWidth;
-    size_t channelSteps =
-        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
-                          (size_t)1),
-                 inputChannels / groups_);
-    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-
-      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColMobileFunctor<real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    int nStride = outputHeight * outputWidth;
-    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
-    for (size_t i = 0; i < batchSize; i++) {
-      filterData = inputs[1].data<real>();
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          real beta_ = beta;
-          for (size_t ic = 0; ic < inputChannels / groups_;
-               ic += channelSteps) {
-            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
-            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
-              int height = std::min(outputHeight - oh, outputHeightSteps);
-
-              int M = outputChannels / groups_;
-              int N = height * outputWidth;
-              int K = channels * filterHeight * filterWidth;
-              // im2col
-              im2col(inputData,
-                     imShape,
-                     colData,
-                     colShape,
-                     strideH(),
-                     strideW(),
-                     paddingH(),
-                     paddingW(),
-                     dilationH(),
-                     dilationW(),
-                     channels,
-                     oh,
-                     height,
-                     N);
-
-              // gemm
-              BlasGemm<Device, real>::compute(
-                  false,
-                  false,
-                  M,
-                  N,
-                  K,
-                  1.0f,
-                  filterData + ic * filterHeight * filterWidth,
-                  kStride,
-                  colData,
-                  N,
-                  beta_,
-                  outputData + oh * outputWidth,
-                  nStride);
-            }
-            beta_ = 1.0;
-          }
-        } else {
-          int M = outputChannels / groups_;
-          int N = outputHeight * outputWidth;
-          int K = inputChannels / groups_ * filterHeight * filterWidth;
-          BlasGemm<Device, real>::compute(false,
-                                          false,
-                                          M,
-                                          N,
-                                          K,
-                                          1.0f,
-                                          filterData,
-                                          K,
-                                          inputData,
-                                          N,
-                                          beta,
-                                          outputData,
-                                          N);
-        }
-        inputData += inputOffset;
-        outputData += outputOffset;
-        filterData += filterOffset;
-      }
-    }
-
-    memory_.reset();
-  }
-};
-
-#endif
-
-/*
- * \brief Backward input calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradInputFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Col2ImFunctor<kCFO, Device, real> col2im;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        int K = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int M = inputChannels / groups_ * filterHeight * filterWidth;
-        real scale = 0.0f;
-        if (!needIm2col) {
-          colData = inputGrad + g * inputOffset;
-          scale = 1.0f;
-        }
-        BlasGemm<Device, real>::compute(true,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        M,
-                                        outputGrad + g * outputOffset,
-                                        N,
-                                        scale,
-                                        colData,
-                                        N);
-        if (needIm2col) {
-          col2im(inputGrad + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        }
-      }
-      inputGrad += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-/*
- * \brief Backward filter calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradFilterFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int K = outputHeight * outputWidth;
-        int N = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        true,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        outputGrad + g * outputOffset,
-                                        K,
-                                        colData,
-                                        K,
-                                        i == 0 ? beta : 1.0f,
-                                        filterGrad + g * filterOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
-#else
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
-#endif
-REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
-REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmConvOpTest.cpp b/paddle/legacy/function/GemmConvOpTest.cpp
deleted file mode 100644
index a30b7c90bb0..00000000000
--- a/paddle/legacy/function/GemmConvOpTest.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ConvOpTest.h"
-
-namespace paddle {
-
-TEST(GemmConv, NaiveConv) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "NaiveConv-CPU", "GemmConv-CPU", forward);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "NaiveConv-CPU", "GemmConv-CPU", forward);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(GemmConv, Forward) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "GemmConv-GPU", forward);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "GemmConv-GPU", forward);
-}
-
-TEST(GemmConv, BackwardInput) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
-}
-
-TEST(GemmConv, BackwardFilter) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmFunctor.cpp b/paddle/legacy/function/GemmFunctor.cpp
deleted file mode 100644
index 450293dfeea..00000000000
--- a/paddle/legacy/function/GemmFunctor.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GemmFunctor.h"
-#include "paddle/legacy/math/MathFunctions.h"
-
-namespace paddle {
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_CPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-#ifdef PADDLE_USE_EIGEN_FOR_BLAS
-    EigenBlasGemm<T>::compute(
-        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
-            transB == false ? CblasNoTrans : CblasTrans,
-            M,
-            N,
-            K,
-            alpha,
-            A,
-            lda,
-            B,
-            ldb,
-            beta,
-            C,
-            ldc);
-#endif
-  }
-};
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_GPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == false ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  transB == false ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
-};
-
-template struct BlasGemm<DEVICE_TYPE_CPU, real>;
-template struct BlasGemm<DEVICE_TYPE_GPU, real>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmFunctor.h b/paddle/legacy/function/GemmFunctor.h
deleted file mode 100644
index df63fc64f84..00000000000
--- a/paddle/legacy/function/GemmFunctor.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorType.h"
-
-namespace paddle {
-
-// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
-// cblas_dgemm interface's parameter format, it is necessary to introduce
-// GemmFunctor as a new interface. Later, when considering the implementation
-// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
-// interface.
-template <DeviceType Device, class T>
-struct BlasGemm {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc);
-};
-
-// TODO(hedaoyuan): Since the definition of the real type in the Paddle
-// conflicts with the Eigen library, so compile the Eigen code can not
-// include the Paddle header file. And need an EigenBlasGemm template class
-// that does not contain the DeviceType parameter.
-// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
-template <class T>
-struct EigenBlasGemm {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GruFunctor.h b/paddle/legacy/function/GruFunctor.h
deleted file mode 100644
index d5a30c33276..00000000000
--- a/paddle/legacy/function/GruFunctor.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GemmFunctor.h"
-#include "hl_cpu_gru.cuh"
-
-namespace paddle {
-
-template <DeviceType Device, class T>
-struct GruFunctor {
-  template <class OpResetOutput, class OpFinalOutput>
-  static void compute(OpResetOutput opResetOutput,
-                      OpFinalOutput opFinalOutput,
-                      hl_gru_value value,
-                      int frameSize,
-                      int batchSize,
-                      hl_activation_mode_t active_node,
-                      hl_activation_mode_t active_gate) {
-#ifndef __NVCC__
-    if (value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   false,
-                                   batchSize,
-                                   2 * frameSize,
-                                   frameSize,
-                                   1,
-                                   value.prevOutValue,
-                                   frameSize,
-                                   value.gateWeight,
-                                   frameSize * 2,
-                                   1,
-                                   value.gateValue,
-                                   frameSize * 3);
-    }
-
-    forward_reset_output(
-        opResetOutput, value, frameSize, batchSize, active_gate);
-
-    if (value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   false,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize,
-                                   1,
-                                   value.resetOutputValue,
-                                   frameSize,
-                                   value.stateWeight,
-                                   frameSize,
-                                   1,
-                                   value.gateValue + frameSize * 2,
-                                   frameSize * 3);
-    }
-
-    forward_final_output(
-        opFinalOutput, value, frameSize, batchSize, active_node);
-#endif
-  }
-};
-
-template <DeviceType Device, class T>
-struct GruGradFunctor {
-  template <class OpStateGrad, class OpResetGrad>
-  static void compute(OpStateGrad opStateGrad,
-                      OpResetGrad opResetGrad,
-                      hl_gru_value value,
-                      hl_gru_grad grad,
-                      int frameSize,
-                      int batchSize,
-                      hl_activation_mode_t active_node,
-                      hl_activation_mode_t active_gate) {
-#ifndef __NVCC__
-    backward_state_grad(
-        opStateGrad, value, grad, frameSize, batchSize, active_node);
-
-    if (value.prevOutValue && grad.prevOutGrad) {
-      BlasGemm<Device, T>::compute(false,
-                                   true,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize,
-                                   1,
-                                   grad.gateGrad + frameSize * 2,
-                                   frameSize * 3,
-                                   value.stateWeight,
-                                   frameSize,
-                                   0,
-                                   grad.resetOutputGrad,
-                                   frameSize);
-
-      if (grad.stateWeightGrad) {
-        BlasGemm<Device, T>::compute(true,
-                                     false,
-                                     frameSize,
-                                     frameSize,
-                                     batchSize,
-                                     1,
-                                     value.resetOutputValue,
-                                     frameSize,
-                                     grad.gateGrad + frameSize * 2,
-                                     frameSize * 3,
-                                     1,
-                                     grad.stateWeightGrad,
-                                     frameSize);
-      }
-    }
-
-    backward_reset_grad(
-        opResetGrad, value, grad, frameSize, batchSize, active_gate);
-
-    if (grad.prevOutGrad && value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   true,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize * 2,
-                                   1,
-                                   grad.gateGrad,
-                                   frameSize * 3,
-                                   value.gateWeight,
-                                   frameSize * 2,
-                                   1,
-                                   grad.prevOutGrad,
-                                   frameSize);
-
-      if (grad.gateWeightGrad) {
-        BlasGemm<Device, T>::compute(true,
-                                     false,
-                                     frameSize,
-                                     frameSize * 2,
-                                     batchSize,
-                                     1,
-                                     value.prevOutValue,
-                                     frameSize,
-                                     grad.gateGrad,
-                                     frameSize * 3,
-                                     1,
-                                     grad.gateWeightGrad,
-                                     frameSize * 2);
-      }
-    }
-#endif
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2Col.h b/paddle/legacy/function/Im2Col.h
deleted file mode 100644
index e0ce6918a2a..00000000000
--- a/paddle/legacy/function/Im2Col.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "neon/neon_util.h"
-
-namespace paddle {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions(CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation,
- *        And in the Col2ImFunctor calculation, it is reversed.
- *
- * \param imData   Image data.
- * \param imShape  The shape of imData,
- *                 [inputChannels, inputHeight, inputWidth].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * inputChannels * filterHeight * filterWidth, and the width is equal
- * outputHeight * outputWidth.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [inputChannels,
- *      filterHeight,
- *      filterWidth,      ======>      [height, width]
- *      outputHeight,
- *      outputWidth]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- * So, it is easy to reshape into a sequence matrix for rnn calculation.
- * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
- * is equal outputHeight * outputWidth, and the stepSize is equal
- * inputChannels * filterHeight * filterWidth.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [outputHeight,
- *      outputWidth,
- *      inputChannels,    ======>    [seqLength, stepSize]
- *      filterHeight,
- *      filterWidth]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <ColFormat Format, DeviceType Device, class T>
-class Im2ColFunctor {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <ColFormat Format, DeviceType Device, class T>
-class Col2ImFunctor {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <class T>
-class Im2ColMobileFunctor {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth,
-                  int inputChannels,
-                  int colOffset,
-                  int colOutputHeight,
-                  int colWidth) {
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputWidth = colShape[4];
-
-    for (int ic = 0; ic < inputChannels; ic++) {
-      for (int oh = 0; oh < colOutputHeight; oh++) {
-        T* dstData = colData + oh * outputWidth;
-        for (int fh = 0; fh < filterHeight; fh++) {
-          for (int fw = 0; fw < filterWidth; fw++) {
-            int imRowIdx = (oh + colOffset) * strideHeight +
-                           fh * dilationHeight - paddingHeight;
-            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
-              memset(dstData, 0, outputWidth * sizeof(T));
-            } else {
-              for (int ow = 0; ow < outputWidth; ow++) {
-                int imColIdx =
-                    ow * strideWidth + fw * dilationWidth - paddingWidth;
-                if (imColIdx < 0 || imColIdx >= inputWidth) {
-                  dstData[ow] = T(0);
-                } else {
-                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
-                }
-              }
-            }
-            dstData += colWidth;
-          }
-        }
-      }
-      colData += filterHeight * filterWidth * colWidth;
-      imData += inputHeight * inputWidth;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOp.cpp b/paddle/legacy/function/Im2ColOp.cpp
deleted file mode 100644
index 55a3ff98db6..00000000000
--- a/paddle/legacy/function/Im2ColOp.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) < 0 ||
-              (imRowIdx - paddingHeight) >= inputHeight ||
-              (imColIdx - paddingWidth) < 0 ||
-              (imColIdx - paddingWidth) >= inputWidth) {
-            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
-          } else {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            colData[(c * outputHeight + h) * outputWidth + w] =
-                imData[imRowIdx * inputWidth + imColIdx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) >= 0 &&
-              (imRowIdx - paddingHeight) < inputHeight &&
-              (imColIdx - paddingWidth) >= 0 &&
-              (imColIdx - paddingWidth) < inputWidth) {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            imData[imRowIdx * inputWidth + imColIdx] +=
-                colData[(c * outputHeight + h) * outputWidth + w];
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
-                  imColOffset < 0 || imColOffset >= inputWidth) {
-                colData[colDataOffset] = float(0);
-              } else {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                colData[colDataOffset] = imData[imDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
-                  imColOffset >= 0 && imColOffset < inputWidth) {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                imData[imDataOffset] += colData[colDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOpGpu.cu b/paddle/legacy/function/Im2ColOpGpu.cu
deleted file mode 100644
index 96dd8f528ea..00000000000
--- a/paddle/legacy/function/Im2ColOpGpu.cu
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include "hl_device_functions.cuh"
-
-namespace paddle {
-
-template <class T>
-__global__ void im2col(const T* data_im,
-                       int numOuts,
-                       int height,
-                       int width,
-                       int blockH,
-                       int blockW,
-                       int strideH,
-                       int strideW,
-                       int paddingH,
-                       int paddingW,
-                       int dilationH,
-                       int dilationW,
-                       int height_col,
-                       int width_col,
-                       T* data_col) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < numOuts) {
-    int w_out = index % width_col;
-    index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * blockH * blockW;
-    int h_in = h_out * strideH;
-    int w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (int i = 0; i < blockH; ++i) {
-      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i * dilationH);
-        int cIdx = int(w_in + j * dilationW);
-        if ((rIdx - (int)paddingH) >= (int)height ||
-            (rIdx - (int)paddingH) < 0 ||
-            (cIdx - (int)paddingW) >= (int)width ||
-            (cIdx - (int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in * height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx * width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    int numKernels = inputChannels * outputHeight * outputWidth;
-    int blocks = (numKernels + 1024 - 1) / 1024;
-    int blockX = 512;
-    int blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                    numKernels,
-                                                    inputHeight,
-                                                    inputWidth,
-                                                    filterHeight,
-                                                    filterWidth,
-                                                    strideHeight,
-                                                    strideWidth,
-                                                    paddingHeight,
-                                                    paddingWidth,
-                                                    dilationHeight,
-                                                    dilationWidth,
-                                                    outputHeight,
-                                                    outputWidth,
-                                                    colData);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2im(size_t n,
-                       const T* data_col,
-                       size_t height,
-                       size_t width,
-                       size_t channels,
-                       size_t blockH,
-                       size_t blockW,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t paddingH,
-                       size_t paddingW,
-                       size_t dilationH,
-                       size_t dilationW,
-                       size_t height_col,
-                       size_t width_col,
-                       T* data_im) {
-  size_t index =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    int filterH = (blockH - 1) * dilationH + 1;
-    int filterW = (blockW - 1) * dilationW + 1;
-
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width - 2 * paddingW) &&
-        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
-      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int h_k = (h - h_col * strideH);
-          int w_k = (w - w_col * strideW);
-          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
-            h_k /= dilationH;
-            w_k /= dilationW;
-            int c_col =
-                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
-                    width_col +
-                w_col;
-            val += data_col[c_col];
-          }
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
-              h * (width - 2 * paddingW) + w] += val;
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
-                        (inputWidth + 2 * paddingWidth);
-
-    size_t blocks = (numKernels + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        numKernels,
-        colData,
-        inputHeight + 2 * paddingHeight,
-        inputWidth + 2 * paddingWidth,
-        inputChannels,
-        filterHeight,
-        filterWidth,
-        strideHeight,
-        strideWidth,
-        paddingHeight,
-        paddingWidth,
-        dilationHeight,
-        dilationWidth,
-        outputHeight,
-        outputWidth,
-        imData);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-
-template <class T>
-__global__ void im2colOCF(const T* imData,
-                          T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationHeight + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationWidth + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= inputHeight || heightOffset < 0 ||
-            widthOffset >= inputWidth || widthOffset < 0) {
-          colData[colOffset] = T(0);
-        } else {
-          colData[colOffset] = imData[imOffset];
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2imOCF(T* imData,
-                          const T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationWidth + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationHeight + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= 0 && heightOffset < inputHeight &&
-            widthOffset >= 0 && widthOffset < inputWidth) {
-          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColTest.cpp b/paddle/legacy/function/Im2ColTest.cpp
deleted file mode 100644
index 2c5f06f3899..00000000000
--- a/paddle/legacy/function/Im2ColTest.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include <gtest/gtest.h>
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-
-namespace paddle {
-
-template <DeviceType Device, class T>
-void TestIm2ColFunctor() {
-  for (size_t channels : {1, 5, 32}) {
-    for (size_t inputHeight : {5, 33, 100}) {
-      for (size_t inputWidth : {5, 32, 96}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-                  TensorShape colShape2 = TensorShape({outputHeight,
-                                                       outputWidth,
-                                                       channels,
-                                                       filterHeight,
-                                                       filterWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(width, height, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, Device, T> im2Col1;
-                  Im2ColFunctor<kOCF, Device, T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-
-                  // The transposition of the result of ColFormat == kCFO
-                  // is equal to the result of ColFormat == kOCF.
-                  MatrixPtr test;
-                  output2->transpose(test, true);
-                  autotest::TensorCheckErr(*output1, *test);
-
-                  Col2ImFunctor<kCFO, Device, T> col2Im1;
-                  Col2ImFunctor<kOCF, Device, T> col2Im2;
-
-                  col2Im1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  col2Im2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  autotest::TensorCheckErr(*input1, *input2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
-
-#ifdef PADDLE_WITH_CUDA
-
-TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
-
-#endif
-
-template <class T>
-void TestIm2ColMobileFunctor() {
-  for (size_t channels : {32}) {
-    for (size_t inputHeight : {33, 100}) {
-      for (size_t inputWidth : {32, 96}) {
-        for (size_t filterHeight : {5}) {
-          for (size_t filterWidth : {7}) {
-            for (size_t stride : {2}) {
-              for (size_t padding : {1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(height, width, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
-                  Im2ColMobileFunctor<T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation,
-                          channels,
-                          0,
-                          outputHeight,
-                          outputHeight * outputWidth);
-
-                  autotest::TensorCheckEqual(*output1, *output2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.cpp b/paddle/legacy/function/MulOp.cpp
deleted file mode 100644
index 750978fc902..00000000000
--- a/paddle/legacy/function/MulOp.cpp
+++ /dev/null
@@ -1,347 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "GemmFunctor.h"
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace {
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
-  }
-}
-}  // namespace
-
-namespace paddle {
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* A = a.getData();
-  const real* B = b.getData();
-  real* C = out.getValue();
-  int* rows = out.getRows();
-  int* cols = out.getCols();
-  size_t width = out.getWidth();
-  size_t height = out.getHeight();
-
-  /// SPARSE_CSC, {a any, b not trans}
-  if (out.getFormat() == SPARSE_CSC) {
-    /// b not trans and a any
-    CHECK(!bTrans);
-    size_t m = !aTrans ? a.getWidth() : a.getHeight();
-    for (size_t i = 0; i < width; i++) {
-      size_t start = out.getColStartIdx(i);
-      size_t end = out.getColStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t rowIdx = rows[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
-                 B[k * width + i];
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
-  if (out.getFormat() == SPARSE_CSR) {
-    /// a and b can not both transpose
-    CHECK(!(aTrans && bTrans));
-    size_t m = a.getWidth();
-    for (size_t i = 0; i < height; i++) {
-      size_t start = out.getRowStartIdx(i);
-      size_t end = out.getRowStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t colIdx = cols[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
-                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-}
-
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      aTrans,
-      bTrans,
-      out.getHeight(),
-      out.getWidth(),
-      !aTrans ? a.getWidth() : a.getHeight(),
-      scaleAB,
-      a.getData(),
-      a.getStride(),
-      b.getData(),
-      b.getStride(),
-      scaleT,
-      out.getData(),
-      out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuSparseMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* B = b.getData();
-  real* C = out.getData();
-  if (out.getWidth() % 32 == 0) {
-    CHECK_EQ((size_t)B % 32, 0UL);
-    CHECK_EQ((size_t)C % 32, 0UL);
-  }
-
-  int* cols = a.getCols();
-  real* values = a.getValue();
-  for (size_t i = 0; i < a.getHeight(); ++i) {
-    const int start = a.getRowStartIdx(i);
-    const int end = a.getRowStartIdx(i + 1);
-    for (int j = start; j < end; ++j) {
-      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
-               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
-                       : const_cast<CpuMatrix&>(b).getRow(i),
-               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
-               out.getWidth());
-    }
-  }
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  real* A = const_cast<real*>(a.getData());
-  real* B = const_cast<real*>(b.getValue());
-  real* C = out.getData();
-  int* rows = b.getRows();
-  int* cols = b.getCols();
-
-  /// SPARSE_CSC format
-  if (b.getFormat() == SPARSE_CSC) {
-    for (size_t j = 0; j < b.getWidth(); ++j) {
-      int start = b.getColStartIdx(j);
-      int end = b.getColStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + j : C + rows[i],
-                    !bTrans ? A + rows[i] : A + j,
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR format
-  if (b.getFormat() == SPARSE_CSR) {
-    for (size_t j = 0; j < b.getHeight(); ++j) {
-      int start = b.getRowStartIdx(j);
-      int end = b.getRowStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + cols[i] : C + j,
-                    !bTrans ? A + j : A + cols[i],
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-}
-
-/**
- * mul operator
- * out = scaleT * out + scaleAB * (A * B)
- * here, scaleT in {0, 1}, scaleAB == 1,
- * out = A * B, ASSIGN_TO
- * out += A * B, ADD_TO
- *
- *
- * \param outputs[0]      output matrix (out), M * N,
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, N is num of columns
- * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, K is num of columns
- * \param inputs[1]       second input matrix (B), K * N (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        K is num of rows, N is num of columns
- *
- * Support eight Mul operators, with both GPU and CPU devices
- * For each device, four Mul operators are supported:
- * 1. dense (out) = dense (A) * dense (B)
- * 2. dense (out) = sparse (A) * dense (B)
- *    sparse matrix only support SPARSE_CSR format
- * 3. dense (out) = dense (A) * sparse (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- * 4. sparse (out) = dense (A) * dense (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- *
- */
-template <DeviceType Device>
-class MulFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    aTrans_ = config.get<bool>("aTrans");
-    bTrans_ = config.get<bool>("bTrans");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(!aTrans_ || !bTrans_)
-        << "Not support both a and b are transpose matrices";
-
-    CHECK_EQ((size_t)2, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
-    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-
-    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
-    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
-    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
-    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
-    /// C = A * B, or C += A * B, for matrix format
-    CHECK_EQ(aCol, bRow);
-    CHECK_EQ(aRow, outputs[0].shape()[0]);
-    CHECK_EQ(bCol, outputs[0].shape()[1]);
-
-    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
-    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
-
-    /// support dense = not both sparse * sparse
-    /// or sparse = dense * dense
-    CHECK((!outputs[0].isSparseArg() &&
-           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
-          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
-           !inputs[1].isSparseArg()));
-
-    auto outMat = outputs[0].matrix<Device>();
-    /// dense matrix = dense matrix * dense matrix
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = dense matrix * sparse matrix
-    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!aTrans_) << "Not supported a transpose";
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].sparse().SparseMatrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = sparse matrix * dense matrix
-    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!bTrans_) << "Not supported b transpose";
-      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
-          << "Only supported SPARSE_CSR format for sparse matrix a";
-      MulOp<Device>(outMat,
-                    inputs[0].sparse().SparseMatrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// sparse matrix = dense matrix * dense matrix
-    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        outputs[0].isSparseArg()) {
-      MulOp<Device>(outSparseMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-  }
-
- private:
-  bool aTrans_;
-  bool bTrans_;
-};
-
-REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.h b/paddle/legacy/function/MulOp.h
deleted file mode 100644
index ab33bde1729..00000000000
--- a/paddle/legacy/function/MulOp.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-/// CPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuSparseMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuSparseMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuSparseMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpGpu.cu b/paddle/legacy/function/MulOpGpu.cu
deleted file mode 100644
index 217c983cb75..00000000000
--- a/paddle/legacy/function/MulOpGpu.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "hl_base.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_mul(const_cast<real*>(a.getData()),
-                !aTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(b.getData()),
-                !bTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(out.getData()),
-                out.getHeight(),
-                out.getWidth(),
-                !aTrans ? a.getWidth() : a.getHeight(),
-                scaleAB,
-                scaleT,
-                a.getStride(),
-                b.getStride(),
-                out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuSparseMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
-                          aTrans ? HPPL_OP_T : HPPL_OP_N,
-                          const_cast<real*>(b.getData()),
-                          HPPL_OP_N,
-                          const_cast<real*>(out.getData()),
-                          out.getHeight(),
-                          out.getWidth(),
-                          b.getHeight(),
-                          scaleAB,
-                          scaleT);
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  }
-}
-
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
-                       aTrans ? HPPL_OP_T : HPPL_OP_N,
-                       const_cast<real*>(b.getData()),
-                       bTrans ? HPPL_OP_T : HPPL_OP_N,
-                       out.sMatrix_.get(),
-                       out.getHeight(),
-                       out.getWidth(),
-                       !bTrans ? b.getHeight() : b.getWidth(),
-                       scaleAB,
-                       scaleT);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpTest.cpp b/paddle/legacy/function/MulOpTest.cpp
deleted file mode 100644
index ab08b6f8696..00000000000
--- a/paddle/legacy/function/MulOpTest.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/tests/test_matrixUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-/**
- *  C += A * B, A, B, C dense matrix
- *  dense = dense * dense
- */
-void testFuncDDDMatrix(
-    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
-  real scaleT = 1.0;
-  size_t heightA = (transa == false) ? dimM : dimK;
-  size_t widthA = (transa == false) ? dimK : dimM;
-  size_t heightB = (transb == false) ? dimK : dimN;
-  size_t widthB = (transb == false) ? dimN : dimK;
-  size_t heightC = dimM;
-  size_t widthC = dimN;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
-  // prepare input arguments
-  /// matrix A : HA * WA
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
-  /// matrix B: HB * WB
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
-
-  /// output matrix C: HC * WC
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDDMatrixMul) {
-  LOG(INFO) << "function test for dense = dense * dense matrix";
-  for (const auto transa : {false, true}) {
-    for (const auto transb : {false, true}) {
-      for (const auto dimM : {1, 10, 100}) {
-        for (const auto dimN : {1, 10}) {
-          for (const auto dimK : {8}) {
-            if (transa && transb) {
-              continue;
-            }
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK;
-            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, B, C dense, A sparse
- * dense = sparse * dense
- */
-void testFuncDSparseDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// sparse matrix A : M * K
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MuLOp, DSparseDMul) {
-  LOG(INFO) << "function test for dense = sparse * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A, C dense, B sparse
- * dense = dense * sparse
- */
-void testFuncDDSparseMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDSparseMul) {
-  LOG(INFO) << "function test for dense = dense * sparse matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A sparse, B, C dense
- * sparse = dense * dense
- */
-void testFuncSparseDDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output sparse matrix C: M * N
-  test.addOutputs(
-      SparseMatrixArg(
-          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
-      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, SparseDDMul) {
-  LOG(INFO) << "function test for sparse = dense * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/NaiveConvOp.cpp b/paddle/legacy/function/NaiveConvOp.cpp
deleted file mode 100644
index 99c8b81acbb..00000000000
--- a/paddle/legacy/function/NaiveConvOp.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-
-namespace paddle {
-
-/*
- * The three arguments are stored in memory in row major order.
- * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
- * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
- * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
- */
-template <class T>
-class NaiveConvFunctor {
- public:
-  void operator()(const T* inputData,
-                  size_t batchSize,
-                  size_t inputChannels,
-                  size_t inputHeight,
-                  size_t inputWidth,
-                  const T* filterData,
-                  size_t filterHeight,
-                  size_t filterWidth,
-                  T* outputData,
-                  size_t outputChannels,
-                  size_t outputHeight,
-                  size_t outputWidth,
-                  size_t paddingH,
-                  size_t paddingW,
-                  size_t strideH,
-                  size_t strideW) {
-    for (size_t batch = 0; batch < batchSize; batch++) {
-      for (size_t outC = 0; outC < outputChannels; outC++) {
-        for (size_t outH = 0; outH < outputHeight; outH++) {
-          for (size_t outW = 0; outW < outputWidth; outW++) {
-            const int inStartH = (outH * strideH) - paddingH;
-            const int inStartW = (outW * strideW) - paddingW;
-            T outValue = (T)0;
-            for (size_t inC = 0; inC < inputChannels; inC++) {
-              for (size_t fH = 0; fH < filterHeight; fH++) {
-                for (size_t fW = 0; fW < filterWidth; fW++) {
-                  T inValue;
-                  const int inH = inStartH + fH;
-                  const int inW = inStartW + fW;
-                  if ((inH >= 0 && inH < (int)inputHeight) &&
-                      (inW >= 0 && inW < (int)inputWidth)) {
-                    size_t offsetInput =
-                        batch * inputChannels * inputHeight * inputWidth +
-                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
-                    inValue = inputData[offsetInput];
-                  } else {
-                    inValue = (T)0;
-                  }
-                  size_t offsetFilter =
-                      outC * inputChannels * filterHeight * filterWidth +
-                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
-                  T filterValue = filterData[offsetFilter];
-                  outValue += (inValue * filterValue);
-                }
-              }
-            }
-
-            size_t offset =
-                batch * outputChannels * outputHeight * outputWidth +
-                outC * outputHeight * outputWidth + outH * outputWidth + outW;
-            outputData[offset] = outValue;
-          }
-        }
-      }
-    }
-  }
-};
-
-template <DeviceType Device>
-class NaiveConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t inputChannels = inputs[0].shape()[1];
-    size_t inputHeight = inputs[0].shape()[2];
-    size_t inputWidth = inputs[0].shape()[3];
-    size_t filterHeight = inputs[1].shape()[2];
-    size_t filterWidth = inputs[1].shape()[3];
-    size_t outputChannels = outputs[0].shape()[1];
-    size_t outputHeight = outputs[0].shape()[2];
-    size_t outputWidth = outputs[0].shape()[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    NaiveConvFunctor<real> conv;
-    conv(inputData,
-         batchSize,
-         inputChannels,
-         inputHeight,
-         inputWidth,
-         filterData,
-         filterHeight,
-         filterWidth,
-         outputData,
-         outputChannels,
-         outputHeight,
-         outputWidth,
-         paddingH(),
-         paddingW(),
-         strideH(),
-         strideW());
-  }
-};
-
-REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOp.cpp b/paddle/legacy/function/PadOp.cpp
deleted file mode 100644
index 9d011d28e69..00000000000
--- a/paddle/legacy/function/PadOp.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Pad<DEVICE_TYPE_CPU>(real* outputs,
-                          const real* inputs,
-                          const int num,
-                          const int inC,
-                          const int inH,
-                          const int inW,
-                          const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
-                              const real* outGrad,
-                              const int num,
-                              const int inC,
-                              const int inH,
-                              const int inW,
-                              const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        CpuVector inG = CpuVector(inW, inGrad + inoff);
-        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
-        inG += outG;
-      }
-    }
-  }
-}
-
-static inline PadConf castToPadConf(const FuncConfig& conf) {
-  return {conf.get<std::vector<uint32_t>>("channel"),
-          conf.get<std::vector<uint32_t>>("height"),
-          conf.get<std::vector<uint32_t>>("width")};
-}
-
-/**
- * \brief Padding zeros to input according to the specify dimension.
- *        The struct pad_ contains the padding size in each dimension.
- *        The input and output is a 4D tensor. In PadFunc, we only
- *        pad zeros to the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the padding size in each dimension.
- *                It has six integers. The channelStart and channelEnd indicate
- *                how many zeros to add before and after the input in channel
- *                dimension. And the heightStart and heightEnd indicate padding
- *                in height dimension. The widthStart and widthEnd indicate the
- *                padding in width dimension.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after padding.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the shape is (1,2,2,3)
- *
- * pad_: if channelStart = channelEnd = 1, others are 0.
- * Output(2,4,2,3) = [
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]],
- *                      [[0,0,0], [0,0,0]] ],
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]],
- *                      [[0,0,0], [0,0,0]] ]
- *                   ] # the shape is (2,4,2,3)
- *
- * pad_: if widthStart = 1, widthEnd = 2, others are 0.
- * Output(2,2,2,6) = [
- *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
- *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
- *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
- *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
- *                   ] # the shape is (2,2,2,6)
- *
- * pad_: if heightStart = 1, heightEnd = 1, others are 0.
- * Output(2,2,4,3) = [
- *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
- *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
- *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
- *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
- *                   ] # the shape is (2,2,4,3)
- */
-
-template <DeviceType Device>
-class PadFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
-                                              outputs[0].data<real>());
-    vec.zero();
-
-    Pad<Device>(outputs[0].data<real>(),
-                inputs[0].data<real>(),
-                num,
-                inC,
-                inH,
-                inW,
-                pad_);
-  }
-
- private:
-  PadConf pad_;
-};
-
-/**
- * \brief The backward propagation of padding Function. Remove the elements
- *        in the padding positions of forward.
- *
- * Argument in this Function:
- * \param pad_    The same meaning as it in PadFunc.
- * \param inputs  The gradient with respect to the output value of PadFunc.
- * \param outputs The gradient with respect to the input value of PadFunc.
- */
-
-template <DeviceType Device>
-class PadGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = outputs[0].shape()[0];
-    size_t inC = outputs[0].shape()[1];
-    size_t inH = outputs[0].shape()[2];
-    size_t inW = outputs[0].shape()[3];
-
-    if (outputs[0].getArgType() != ADD_TO) {
-      // for unit test
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    PadGrad<Device>(outputs[0].data<real>(),
-                    inputs[0].data<real>(),
-                    num,
-                    inC,
-                    inH,
-                    inW,
-                    pad_);
-  }
-
- private:
-  PadConf pad_;
-};
-
-REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOp.h b/paddle/legacy/function/PadOp.h
deleted file mode 100644
index 4b0aa4014bb..00000000000
--- a/paddle/legacy/function/PadOp.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-struct PadConf {
-  /// how many values to add before/after the data along channel dimension.
-  std::vector<uint32_t> channel;
-  /// how many values to add before/after the data along height dimension.
-  std::vector<uint32_t> height;
-  /// how many values to add before/after the data along width dimension.
-  std::vector<uint32_t> width;
-};
-
-/**
- * \brief  This funtion pads zeros to inputs according to the specify dimension.
- *         The input and output is a 4D tensor. Padding zeros from the 2nd to
- *         the 4th dimenstion according argument of pad.
- *
- * \param[out] outputs save results.
- * \param[in]  inputs  input data.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  pad     the padding config, contains the size along the
- *                     specify dimension.
- */
-template <DeviceType Device>
-void Pad(real* outputs,
-         const real* inputs,
-         const int num,
-         const int inC,
-         const int inH,
-         const int inW,
-         const PadConf& pad);
-
-/**
- * \brief   Padding operation backward.
- *
- * \param[out] inGrad  gradients of previous layer.
- * \param[in]  outGrad output gradients.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  pad     the padding config, contains the size along the
- *                     specify dimension.
- */
-template <DeviceType Device>
-void PadGrad(real* inGrad,
-             const real* outGrad,
-             const int num,
-             const int inC,
-             const int inH,
-             const int inW,
-             const PadConf& pad);
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOpGpu.cu b/paddle/legacy/function/PadOpGpu.cu
deleted file mode 100644
index 01d9b5c3b2a..00000000000
--- a/paddle/legacy/function/PadOpGpu.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KePad(real* outputs,
-                      const real* inputs,
-                      int inC,
-                      int inH,
-                      int inW,
-                      int padc,
-                      int padh,
-                      int padw,
-                      int outC,
-                      int outH,
-                      int outW,
-                      int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
-    outputs[off] = inputs[idx];
-  }
-}
-
-template <>
-void Pad<DEVICE_TYPE_GPU>(real* outputs,
-                          const real* inputs,
-                          const int num,
-                          const int inC,
-                          const int inH,
-                          const int inW,
-                          const PadConf& pad) {
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
-                                                    inputs,
-                                                    inC,
-                                                    inH,
-                                                    inW,
-                                                    cstart,
-                                                    hstart,
-                                                    wstart,
-                                                    outC,
-                                                    outH,
-                                                    outW,
-                                                    nth);
-  CHECK_SYNC("Pad");
-}
-
-__global__ void KePadDiff(real* inGrad,
-                          const real* outGrad,
-                          int inC,
-                          int inH,
-                          int inW,
-                          int padc,
-                          int padh,
-                          int padw,
-                          int outC,
-                          int outH,
-                          int outW,
-                          int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
-    inGrad[idx] += outGrad[off];
-  }
-}
-
-template <>
-void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
-                              const real* outGrad,
-                              const int num,
-                              const int inC,
-                              const int inH,
-                              const int inW,
-                              const PadConf& pad) {
-  int nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
-                                                        outGrad,
-                                                        inC,
-                                                        inH,
-                                                        inW,
-                                                        cstart,
-                                                        hstart,
-                                                        wstart,
-                                                        outC,
-                                                        outH,
-                                                        outW,
-                                                        nth);
-  CHECK_SYNC("PadGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOpTest.cpp b/paddle/legacy/function/PadOpTest.cpp
deleted file mode 100644
index a4474f85498..00000000000
--- a/paddle/legacy/function/PadOpTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Pad, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {1, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {false, true}) {
-            CpuGpuFuncCompare compare(
-                test_grad ? "PadGrad" : "Pad",
-                FuncConfig()
-                    .set<std::vector<uint32_t>>("channel", {2, 3})
-                    .set<std::vector<uint32_t>>("height", {1, 2})
-                    .set<std::vector<uint32_t>>("width", {3, 2}));
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{
-                numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(
-                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOp.cpp b/paddle/legacy/function/RowConvOp.cpp
deleted file mode 100644
index 3be50e80d71..00000000000
--- a/paddle/legacy/function/RowConvOp.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvOp.h"
-#include <iostream>
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                              const CpuMatrix& in,
-                              const CpuMatrix& filter,
-                              const CpuIVector& seq) {
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  for (size_t i = 0; i < numSeq; ++i) {
-    size_t begin = starts[i];
-    size_t end = starts[i + 1];
-    for (size_t j = begin; j < end; ++j) {
-      MatrixPtr x;
-      MatrixPtr w;
-      if ((j + contextLength) < end) {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
-      } else {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
-      }
-      MatrixPtr y = out.subMatrix(j, 1);
-      y->addDotMulVMM(*x, *w);
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
-                                  const CpuMatrix& in,
-                                  const CpuMatrix& filter,
-                                  CpuMatrix& inG,
-                                  CpuMatrix& filterG,
-                                  const CpuIVector& seq) {
-  // gradient w.r.t filter
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  if (filterG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
-        MatrixPtr x =
-            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
-        MatrixPtr dy =
-            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
-        MatrixPtr dw = filterG.subMatrix(j, 1);
-        dw->addDotMulVMM(*dy, *x);
-      }
-    }
-  }
-
-  // gradient w.r.t input feature
-  if (inG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < steps; ++j) {
-        MatrixPtr dx = inG.subMatrix(begin + j, 1);
-        for (size_t t = 0; t < contextLength; ++t) {
-          if (int(j - t) >= 0) {
-            MatrixPtr dy =
-                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
-            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
-            dx->addDotMul(*dy, *w, 1.0, 1.0);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief The row convolution is called lookahead convolution. It is firstly
- * introduced in deep-speech2 system. The bidirectional RNN that learns
- * representation for a sequence by performing a forward and a backward pass
- * through the entire sequence. However, unlike unidirectional RNNs,
- * bidirectional RNNs are challenging to deploy in an online and low-latency
- * setting. The lookahead convolution incorporates information from future
- * subsequences in a computationally efficient manner to improve unidirectional
- * recurrent neural networks.
- *
- * The connection of row convolution is different form the 1D sequence
- * convolution. Assumed that, the future context-length is k, that is to say,
- * it can get the output at timestep t by using the the input feature from t-th
- * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
- * activations are d, the activations r_t for the new layer at time-step t are:
- *
- *
- *            -- k + 1
- *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
- *            -- j = 1
- *
- *
- * The weight shape is: (k + 1) x d
- * Function Arguments:
- *
- * \param inputs[0]  The input activations.
- * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[1] The output activations.
- *
- * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
- * English
- *     and Mandarin. https://arxiv.org/abs/1512.02595
- */
-
-template <DeviceType Device>
-class RowConvFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    // TODO(qingqing): support ASSIGN_TO.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto w = inputs[1];
-    CHECK(in.data() && out.data() && in.getSequenceId().data());
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == out.shape());
-    CHECK_EQ(w.shape()[1], in.shape()[1]);
-
-    auto outMat = out.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConv<Device>(outMat, inMat, wMat, seqId);
-  }
-};
-
-/**
- * \brief The backward of row convolution function. This function calculated
- * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
- *
- * Argument in this Function:
- *
- * \param inputs[0]  The gradient w.r.t output activations.
- * \param inputs[1]  The input activations.
- * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[0] The gradient w.r.t input activations.
- * \param outputs[1] The gradient w.r.r filter.
- *
- * Abbreviation:
- * w.r.t: with respect to.
- */
-
-template <DeviceType Device>
-class RowConvGradFunc : public FunctionBase {
-  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(3UL, inputs.size());
-    CHECK_EQ(2UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
-          outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-
-    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
-    const auto w = inputs[2];
-    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto wGrad = outputs[1];
-
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == inGrad.shape());
-    CHECK(in.shape() == outGrad.shape());
-    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
-
-    const auto outGMat = outGrad.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    auto inGMat = inGrad.data()
-                      ? inGrad.matrix<Device>()
-                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    auto wGMat = wGrad.data()
-                     ? wGrad.matrix<Device>()
-                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
-  }
-};
-
-REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOp.h b/paddle/legacy/function/RowConvOp.h
deleted file mode 100644
index bfe775e014d..00000000000
--- a/paddle/legacy/function/RowConvOp.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief The forward of row convolution.
- *
- * \param[out] out      The output data and shape is h x d. h is the sum of
- *                      time steps of all samples in one mini-batch.
- * \param[in]  in       The input data and shape is h x d.
- * \param[in]  filter   The filter and shape is k x d. The lookahead step
- *                      number plus one equals k.
- * \param[in]  seq      The sequence start positions.
- *
- */
-template <DeviceType DType>
-void RowConv(typename Tensor<real, DType>::Matrix& out,
-             const typename Tensor<real, DType>::Matrix& in,
-             const typename Tensor<real, DType>::Matrix& filter,
-             const typename Tensor<int, DType>::Vector& seq);
-
-/**
- * \brief The backward of row convolution.
- *
- * \param[in]  outG     The gradient w.r.t output data.
- * \param[in]  in       The input data.
- * \param[in]  filter   The filter.
- * \param[out] inG      The gradient w.r.t input data.
- * \param[out] filterG  The gradient w.r.t filter.
- * \param[in]  seq      The sequence start positions.
- *
- */
-template <DeviceType DType>
-void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
-                 const typename Tensor<real, DType>::Matrix& in,
-                 const typename Tensor<real, DType>::Matrix& filter,
-                 typename Tensor<real, DType>::Matrix& inG,
-                 typename Tensor<real, DType>::Matrix& filterG,
-                 const typename Tensor<int, DType>::Vector& seq);
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOpGpu.cu b/paddle/legacy/function/RowConvOpGpu.cu
deleted file mode 100644
index a6d2e4c7e38..00000000000
--- a/paddle/legacy/function/RowConvOpGpu.cu
+++ /dev/null
@@ -1,373 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/cuda/include/hl_base.h"
-#include "paddle/legacy/function/RowConvOp.h"
-
-namespace paddle {
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConv(real* y,
-                          const real* x,
-                          const real* w,
-                          const int* starts,
-                          const int height,
-                          const int width,
-                          const int numSeq,
-                          const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context; ++t) {
-        if ((start + j + t) < end) {
-          int xoff = off + t * width;
-          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-          sum += sw[t][tidx] * xVal;
-        }
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConv2(real* y,
-                           const real* x,
-                           const real* w,
-                           const int* starts,
-                           const int height,
-                           const int width,
-                           const int numSeq,
-                           const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      int off = (start + j) * width;
-      real sum = 0;
-      for (int t = 0; t < context && (start + j + t) < end; ++t) {
-        int xoff = off + t * width;
-        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wd * xd;
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,  // NOLINT
-                              const GpuMatrix& in,
-                              const GpuMatrix& filter,
-                              const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  real* y = out.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  dim3 dimBlock(32, 32);
-  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-
-  if (contextLength <= 32) {
-    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  }
-  CHECK_SYNC("RowConv");
-}
-
-template <int BLOCK_H, int BLOCK_W, int CONTEXT>
-__global__ void KeRowConvBwWeight(real* dw,
-                                  const real* x,
-                                  const real* dy,
-                                  const int* starts,
-                                  const int height,
-                                  const int width,
-                                  const int numSeq,
-                                  const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_W][BLOCK_H];
-  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
-  __shared__ real sh_dw[CONTEXT][BLOCK_W];
-
-  if (tidy < context) {
-    sh_dw[tidy][tidx] = 0.0;
-  }
-  __syncthreads();
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy + context - 1] =
-          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
-      __syncthreads();
-      if (tidy < (context - 1)) {
-        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] =
-            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
-      }
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-
-        for (int offset = 16; offset > 0; offset /= 2)
-          val += __shfl_down_sync(mask, val, offset);
-
-        __syncthreads();
-        if (tidx == 0) {
-          sh_dw[t][tidy] += val;
-        }
-        __syncthreads();
-      }
-    }
-  }
-
-  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
-    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwWeight2(real* dw,
-                                   const real* x,
-                                   const real* dy,
-                                   const int* starts,
-                                   const int height,
-                                   const int width,
-                                   const int numSeq,
-                                   const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_H][BLOCK_W];
-  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] =
-            (xoff < width && (yoff - t) >= start && yoff - t < end)
-                ? dy[(yoff - t) * width + xoff]
-                : 0.0;
-        __syncthreads();
-
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-        for (int offset = 16; offset > 0; offset /= 2)
-          val += __shfl_down_sync(mask, val, offset);
-
-        __syncthreads();
-
-        if (tidx == 0 && (gidx + tidy) < width) {
-          dw[t * width + gidx + tidy] += val;
-        }
-      }
-    }
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwData(real* dx,
-                                const real* w,
-                                const real* dy,
-                                const int* starts,
-                                const int height,
-                                const int width,
-                                const int numSeq,
-                                const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        sum += sw[t][tidx] * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConvBwData2(real* dx,
-                                 const real* w,
-                                 const real* dy,
-                                 const int* starts,
-                                 const int height,
-                                 const int width,
-                                 const int numSeq,
-                                 const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wVal * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
-                                  const GpuMatrix& in,
-                                  const GpuMatrix& filter,
-                                  GpuMatrix& inG,      // NOLINT
-                                  GpuMatrix& filterG,  // NOLINT
-                                  const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  const real* dy = outG.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  if (filterG) {
-    dim3 dimBlock(32, 32);
-    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-    real* dw = filterG.getData();
-    if (contextLength <= 32) {
-      KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  if (inG) {
-    real* dx = inG.getData();
-    dim3 dimBlock2(32, 32);
-    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
-    if (contextLength <= 64) {
-      KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  CHECK_SYNC("RowConvGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOpTest.cpp b/paddle/legacy/function/RowConvOpTest.cpp
deleted file mode 100644
index bbc29ad6a6b..00000000000
--- a/paddle/legacy/function/RowConvOpTest.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
-  CpuGpuFuncCompare test("RowConv", FuncConfig());
-
-  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
-
-  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
-                  ADD_TO);
-
-  test.run();
-}
-
-void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
-  CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
-
-  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
-
-  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
-                  ADD_TO);
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
-                  ADD_TO);
-
-  test.run();
-}
-
-TEST(RowConv, real) {
-  for (size_t numSamples : {17, 129, 2020}) {
-    for (size_t dim : {16, 512, 2560}) {
-      for (size_t context : {3, 19, 65}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " context length=" << context;
-        testRowConvFw(numSamples, dim, context);
-        testRowConvBw(numSamples, dim, context);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOp.cpp b/paddle/legacy/function/ScaleSubRegionOp.cpp
deleted file mode 100644
index 03a422a740d..00000000000
--- a/paddle/legacy/function/ScaleSubRegionOp.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionOp.h"
-#include "paddle/legacy/function/TensorShape.h"
-
-namespace paddle {
-
-template <>
-void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
-                                     const real* inputs,
-                                     const real* indices,
-                                     const TensorShape shape,
-                                     const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
-
-  for (int n = 0; n < number; ++n) {
-    // indices start from 1
-    int offset = n * 6;
-    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
-      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
-        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          outputs[idx] *= value;
-        }
-      }
-    }
-  }
-}
-
-template <>
-void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                                         real* outGrad,
-                                         const real* indices,
-                                         const TensorShape shape,
-                                         const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  for (int n = 0; n < number; ++n) {
-    for (int c = 0; c < channel; ++c) {
-      for (int h = 0; h < height; ++h) {
-        for (int w = 0; w < width; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          int offset = n * 6;
-          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-              h >= (indices[offset + 2] - 1) &&
-              h <= (indices[offset + 3] - 1) &&
-              w >= (indices[offset + 4] - 1) &&
-              w <= (indices[offset + 5] - 1)) {
-            outGrad[idx] += inGrad[idx] * value;
-          } else {
-            outGrad[idx] += inGrad[idx];
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief For each instance, ScaleSubRegion can be used to multiply a value to
- *        a specified sub continuous region. By providing start index and end
- *        index for C/H/W, you can specify the location and shape of the region.
- *
- * Argument in this Function:
- * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
- * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs   A 4-D tensor with same shape as inputs, output value.
- */
-template <DeviceType Device>
-class ScaleSubRegionFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegion<Device>(outputs[0].data<real>(),
-                           inputs[0].data<real>(),
-                           inputs[1].data<real>(),
-                           shape,
-                           conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of ScaleSubRegion Function.
- *
- * Argument in this Function:
- * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
- * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
- */
-
-template <DeviceType Device>
-class ScaleSubRegionGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
-                               outputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               shape,
-                               conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOp.h b/paddle/legacy/function/ScaleSubRegionOp.h
deleted file mode 100644
index ed7d6b8ad3c..00000000000
--- a/paddle/legacy/function/ScaleSubRegionOp.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief Function to multiply a value to values in specified sub continuous
- *        region. Indices must be provided to indcate the location and shape of
- *        the region and the multiplied value is passed by configure variable.
- *
- *
- * \param[out] outputs  Output value.
- * \param[in]  inputs   Input data which contains NCHW information.
- * \param[in]  indices  Indices data to indcate the sub region.
- * \param[in]  shape    Tensor shape of input value.
- * \param[in]  conf     Configure variable which contains the multiplied value.
- */
-template <DeviceType Device>
-void ScaleSubRegion(real* outputs,
-                    const real* inputs,
-                    const real* indices,
-                    const TensorShape shape,
-                    const FuncConfig& conf);
-
-/**
- * \brief Backward propagation function of ScaleSubRegion.
- *
- * \param[out] inGrad   Gradients of previous layer.
- * \param[in]  outGrad  Output gradient.
- * \param[in]  indices  Indices data.
- * \param[in]  shape    The Shape of input tensor.
- * \param[in]  conf     Configure variable.
- */
-template <DeviceType Device>
-void ScaleSubRegionGrad(const real* inGrad,
-                        real* outGrad,
-                        const real* indices,
-                        const TensorShape shape,
-                        const FuncConfig& conf);
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOpGpu.cu b/paddle/legacy/function/ScaleSubRegionOpGpu.cu
deleted file mode 100644
index 9784c51ae03..00000000000
--- a/paddle/legacy/function/ScaleSubRegionOpGpu.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeScaleSubRegion(real* outputs,
-                                 const real* inputs,
-                                 const real* indices,
-                                 real value,
-                                 int channel,
-                                 int height,
-                                 int width,
-                                 int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int c = (idx / width / height) % channel;
-    const int n = idx / width / height / channel;
-
-    const int offset = n * 6;
-    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
-        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
-      outputs[idx] = inputs[idx] * value;
-    } else {
-      outputs[idx] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
-                                     const real* inputs,
-                                     const real* indices,
-                                     const TensorShape shape,
-                                     const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  size_t nth = number * channel * height * width;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, indices, value, channel, height, width, nth);
-  CHECK_SYNC("ScaleSubRegion");
-}
-
-__global__ void KeScaleSubRegionDiff(const real* inGrad,
-                                     real* outGrad,
-                                     const real* indices,
-                                     real value,
-                                     int channel,
-                                     int height,
-                                     int width,
-                                     int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int c = (idx / width / height) % channel;
-    const int n = idx / width / height / channel;
-
-    const int offset = n * 6;
-    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
-        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
-      outGrad[idx] += inGrad[idx] * value;
-    } else {
-      outGrad[idx] += inGrad[idx];
-    }
-  }
-}
-
-template <>
-void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
-                                         real* outGrad,
-                                         const real* indices,
-                                         const TensorShape shape,
-                                         const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  size_t nth = number * channel * height * width;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      inGrad, outGrad, indices, value, channel, height, width, nth);
-  CHECK_SYNC("ScaleSubRegionGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOpTest.cpp b/paddle/legacy/function/ScaleSubRegionOpTest.cpp
deleted file mode 100644
index dd6ee671089..00000000000
--- a/paddle/legacy/function/ScaleSubRegionOpTest.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(ScaleSubRegion, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {5, 32}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (real value : {-0.5, 0.0, 0.5}) {
-            for (bool firstHalf : {false, true}) {
-              VLOG(3) << " numSamples=" << numSamples
-                      << " channels=" << channels << " imgSizeH=" << imgSizeH
-                      << " imgSizeW=" << imgSizeW;
-
-              for (bool testGrad : {false, true}) {
-                CpuGpuFuncCompare compare(
-                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
-                    FuncConfig().set<real>("value", value));
-
-                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-                TensorShape indicesShape{numSamples, 6};
-
-                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
-
-                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
-                  if (index == 1) {
-                    real* data = (real*)arg.data();
-
-                    for (size_t i = 0; i < numSamples; ++i) {
-                      size_t offset = i * 6;
-                      data[offset] = firstHalf ? 1 : channels / 2;
-                      data[offset + 1] = firstHalf ? channels / 2 : channels;
-                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
-                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
-                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
-                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
-                    }
-                  }
-                });
-
-                compare.addOutputs(
-                    BufferArg(
-                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
-                    testGrad ? ADD_TO : ASSIGN_TO);
-                compare.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOp.cpp b/paddle/legacy/function/SwitchOp.cpp
deleted file mode 100644
index c6accd18039..00000000000
--- a/paddle/legacy/function/SwitchOp.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inC,
-                                const int inH,
-                                const int inW,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < inC; ++c) {
-      for (int h = 0; h < inH; ++h) {
-        for (int w = 0; w < inW; ++w) {
-          if (argType == ADD_TO) {
-            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
-          } else {
-            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <>
-void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inH,
-                                const int inW,
-                                const int inC,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int h = 0; h < inH; ++h) {
-      for (int w = 0; w < inW; ++w) {
-        for (int c = 0; c < inC; ++c) {
-          if (argType == ADD_TO) {
-            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
-          } else {
-            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size,channels, height, width' to
- *         order 'batch_size, height, width, channels'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size,channels, height, width'.
- * \param outputs output data with order 'batch_size, height, width, channels'.
- */
-template <DeviceType Device>
-class NCHW2NHWCFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    NCHW2NHWC<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inC,
-                      inH,
-                      inW,
-                      outputs[0].getArgType());
-  }
-};
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size, height, width, channels' to
- *         order 'batch_size, channels, height, width'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size, height, width, channels'.
- * \param outputs output data with order 'batch_size, channels, height, width'.
- */
-template <DeviceType Device>
-class NHWC2NCHWFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inH = inputs[0].shape()[1];
-    size_t inW = inputs[0].shape()[2];
-    size_t inC = inputs[0].shape()[3];
-
-    NHWC2NCHW<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inH,
-                      inW,
-                      inC,
-                      outputs[0].getArgType());
-  }
-};
-
-REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOp.h b/paddle/legacy/function/SwitchOp.h
deleted file mode 100644
index b5eb0883cb6..00000000000
--- a/paddle/legacy/function/SwitchOp.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief  This funtion switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order 'batch_size,
- *channels, height, width' to
- *         order 'batch_size, height, width, channels'.
- *
- * \param[out] outputs save results.
- * \param[in]  inputs  input data.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  argType     type of output argument.
- */
-template <DeviceType Device>
-void NCHW2NHWC(real* outputs,
-               const real* inputs,
-               const int num,
-               const int inC,
-               const int inH,
-               const int inW,
-               const int argtype);
-
-/**
- * \brief  This funtion switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order 'batch_size,
- *height, width, channels' to
- *         order 'batch_size, channels, height, width'.
- *
- * \param[out] inGrad  gradients of previous layer.
- * \param[in]  outGrad output gradients.
- * \param[in]  num     batch size of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inW     with of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  argType     type of output argument.
- */
-template <DeviceType Device>
-void NHWC2NCHW(real* inGrad,
-               const real* outGrad,
-               const int num,
-               const int inH,
-               const int inW,
-               const int inC,
-               const int argType);
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOpGpu.cu b/paddle/legacy/function/SwitchOpGpu.cu
deleted file mode 100644
index 45390a56c3f..00000000000
--- a/paddle/legacy/function/SwitchOpGpu.cu
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 Paddle
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeNCHW2NHWC(real* outputs,
-                            const real* inputs,
-                            int inC,
-                            int inH,
-                            int inW,
-                            int nthreads,
-                            int argType) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * inH + h) * inW + w) * inC + c;
-    if (argType == ADD_TO) {
-      outputs[off] += inputs[idx];
-    } else {
-      outputs[off] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inC,
-                                const int inH,
-                                const int inW,
-                                const int argType) {
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, inC, inH, inW, nth, argType);
-  CHECK_SYNC("NCHW2NHWC");
-}
-
-__global__ void KeNHWC2NCHW(real* outputs,
-                            const real* inputs,
-                            int inH,
-                            int inW,
-                            int inC,
-                            int nthreads,
-                            int argType) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int c = idx % inC;
-    const int w = (idx / inC) % inW;
-    const int h = (idx / inC / inW) % inH;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * inC + c) * inH + h) * inW + w;
-    if (argType == ADD_TO) {
-      outputs[off] += inputs[idx];
-    } else {
-      outputs[off] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inH,
-                                const int inW,
-                                const int inC,
-                                const int argType) {
-  int nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, inH, inW, inC, nth, argType);
-  CHECK_SYNC("NHWC2NCHW");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOpTest.cpp b/paddle/legacy/function/SwitchOpTest.cpp
deleted file mode 100644
index 08e5a613c06..00000000000
--- a/paddle/legacy/function/SwitchOpTest.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Pad, real) {
-  for (size_t numSamples : {1, 4, 8, 16}) {
-    for (size_t channels : {1, 4, 8, 16}) {
-      for (size_t imgSizeH : {1, 4, 8, 16}) {
-        for (size_t imgSizeW : {1, 4, 8, 16}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {true, false}) {
-            CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
-                                      FuncConfig());
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(
-                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorShape.h b/paddle/legacy/function/TensorShape.h
deleted file mode 100644
index d4d1eae3960..00000000000
--- a/paddle/legacy/function/TensorShape.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-namespace paddle {
-
-/**
- * TensorShape used to represent shape of normal tensor.
- */
-class TensorShape {
- public:
-  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
-
-  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
-
-  TensorShape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    initDims(ndims_);
-    dims_.assign(dims);
-    numElements();
-  };
-
-  TensorShape(const TensorShape& t)
-      : ndims_(t.ndims_), nelements_(t.nelements_) {
-    initDims(ndims_);
-    dims_.assign(t.dims_.begin(), t.dims_.end());
-  };
-
-  // get the size of specified dimension
-  size_t operator[](size_t dim) const {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    return dims_[dim];
-  }
-
-  // set the size of specified dimension
-  void setDim(size_t dim, size_t size) {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    dims_[dim] = size;
-    numElements();
-  }
-
-  void reshape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    if (ndims_ > kMinDims) {
-      dims_.resize(ndims_);
-    }
-    dims_.assign(dims);
-    numElements();
-  }
-
-  // number of dimensions of the tensor
-  size_t ndims() const { return ndims_; }
-
-  size_t getElements() const { return nelements_; }
-
-  bool operator==(const TensorShape& t) const {
-    if (ndims() != t.ndims()) return false;
-    for (size_t i = 0; i < ndims(); i++) {
-      if (dims_[i] != t.dims_[i]) return false;
-    }
-
-    return true;
-  }
-
-  bool operator!=(const TensorShape& t) const { return !(*this == t); }
-
- private:
-  // compute number of elements
-  void numElements() {
-    nelements_ = 1;
-    for (size_t n = 0; n < ndims_; n++) {
-      nelements_ *= dims_[n];
-    }
-  }
-
-  // init dims_
-  void initDims(size_t ndims) {
-    size_t count = ndims < kMinDims ? kMinDims : ndims;
-    dims_.assign(count, 1);
-  }
-
-  // number of dimensions
-  // ndims_ may be not equeal dims_.size()
-  size_t ndims_;
-  // number of elements
-  size_t nelements_;
-  std::vector<size_t> dims_;
-  static const size_t kMinDims = 4;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorShapeTest.cpp b/paddle/legacy/function/TensorShapeTest.cpp
deleted file mode 100644
index 4d692b9b97a..00000000000
--- a/paddle/legacy/function/TensorShapeTest.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorShape.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-
-TEST(TensorShape, Constructor) {
-  TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0U);
-  EXPECT_EQ(t1.getElements(), 0U);
-
-  TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3U);
-  EXPECT_EQ(t2.getElements(), 1U);
-
-  TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2U);
-  EXPECT_EQ(t3.getElements(), 80U);
-
-  TensorShape t4(t3);
-  EXPECT_EQ(t4.ndims(), t3.ndims());
-  EXPECT_EQ(t4.getElements(), t3.getElements());
-
-  TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5U);
-  EXPECT_EQ(t5.getElements(), 120U);
-}
-
-TEST(TensorShape, GetAndSet) {
-  TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3U);
-  EXPECT_EQ(t.getElements(), 6U);
-
-  EXPECT_EQ(t[1], 2U);
-  t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300U);
-  EXPECT_EQ(t[1], 100U);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorType.h b/paddle/legacy/function/TensorType.h
deleted file mode 100644
index 13994821be7..00000000000
--- a/paddle/legacy/function/TensorType.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-enum ValueType {
-  VALUE_TYPE_INT32 = 0,
-  VALUE_TYPE_FLOAT = 1,
-  VALUE_TYPE_DOUBLE = 2,
-  VALUE_TYPE_BYTE = 3
-};
-
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2
-};
-
-enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
-
-enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
-
-inline int sizeOfValuType(ValueType valueType) {
-  if (valueType == VALUE_TYPE_INT32) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_FLOAT) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_DOUBLE) {
-    return 8;
-  } else {
-    LOG(FATAL) << "Unknown type: " << valueType;
-    return 0;
-  }
-}
-
-template <typename T>
-struct DataType;
-
-template <>
-struct DataType<float> {
-  static const ValueType value = VALUE_TYPE_FLOAT;
-};
-
-template <>
-struct DataType<double> {
-  static const ValueType value = VALUE_TYPE_DOUBLE;
-};
-
-template <>
-struct DataType<int> {
-  static const ValueType value = VALUE_TYPE_INT32;
-};
-
-namespace detail {
-
-template <typename VType, DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct SparseMatrixT;
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct VectorT;
-
-template <>
-struct VectorT<real, DEVICE_TYPE_CPU> {
-  using type = CpuVector;
-};
-
-template <>
-struct VectorT<real, DEVICE_TYPE_GPU> {
-  using type = GpuVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-
-}  // namespace detail
-
-template <typename VType, DeviceType DType>
-struct Tensor {
-  typedef typename detail::VectorT<VType, DType>::type Vector;
-  typedef typename detail::MatrixT<VType, DType>::type Matrix;
-  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorTypeTest.cpp b/paddle/legacy/function/TensorTypeTest.cpp
deleted file mode 100644
index d0cd63147a8..00000000000
--- a/paddle/legacy/function/TensorTypeTest.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorType.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-
-TEST(TensorType, Matrix) {
-  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
-  EXPECT_EQ(matrix.getHeight(), 100U);
-  EXPECT_EQ(matrix.getWidth(), 200U);
-  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
-  EXPECT_EQ(matrix.useGpu(), false);
-
-  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
-  EXPECT_EQ(testGpu.useGpu(), true);
-}
-
-TEST(TensorType, Vector) {
-  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);
-  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
-  EXPECT_EQ(cpuVector.useGpu(), false);
-  EXPECT_EQ(gpuVector.useGpu(), true);
-  EXPECT_EQ(cpuVector.getSize(), 100U);
-  EXPECT_EQ(gpuVector.getSize(), 100U);
-
-  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
-  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
-  EXPECT_EQ(cpuIVector.useGpu(), false);
-  EXPECT_EQ(gpuIVector.useGpu(), true);
-  EXPECT_EQ(cpuIVector.getSize(), 100U);
-  EXPECT_EQ(gpuIVector.getSize(), 100U);
-}
-
-TEST(TensorType, EmptyMatrix) {
-  CpuMatrix empty(nullptr, 0, 0);
-  CpuMatrix nonEmpty(10, 10);
-  EXPECT_EQ(empty.isEmpty(), true);
-  EXPECT_EQ(nonEmpty.isEmpty(), false);
-  CHECK(nonEmpty);
-  auto function = [](const CpuMatrix& matrix) {
-    if (matrix) {
-      EXPECT_NE(matrix.getData(), nullptr);
-    } else {
-      EXPECT_EQ(matrix.getData(), nullptr);
-    }
-  };
-  function(empty);
-  function(nonEmpty);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
deleted file mode 100644
index 6179635a9fe..00000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input
-    float* inputPadding = inputData;
-    int padInputHeight = inputHeight + 2 * paddingH();
-    int padInputWidth = inputWidth + 2 * paddingW();
-    int newSize =
-        batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
-
-    resizeBuffer<Device>(newSize);
-    inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-    neon::Padding<float>::run(inputData,
-                              inputPadding,
-                              batchSize * inputChannels,
-                              inputHeight,
-                              inputWidth,
-                              padInputHeight,
-                              padInputWidth);
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 3 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
-    } else if (filterWidth == 4 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else if (filterWidth == 4 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.h b/paddle/legacy/function/neon/NeonDepthwiseConv.h
deleted file mode 100644
index 8b2cba263e7..00000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConv.h
+++ /dev/null
@@ -1,627 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string.h>
-#include "neon_util.h"
-
-namespace paddle {
-namespace neon {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <int filterSize, int stride>
-struct DepthwiseConvKernel {};
-
-inline float32_t conv3x3(const float* r0,
-                         const float* r1,
-                         const float* r2,
-                         float32x4_t k0,
-                         float32x4_t k1,
-                         float32x4_t k2) {
-  float32_t tmp[12];
-  vst1q_f32(&(tmp[0]), k0);
-  vst1q_f32(&(tmp[4]), k1);
-  vst1q_f32(&(tmp[8]), k2);
-  float32_t sum0 = r0[0] * tmp[0] + r0[1] * tmp[1] + r0[2] * tmp[2];
-  float32_t sum1 = r1[0] * tmp[4] + r1[1] * tmp[5] + r1[2] * tmp[6];
-  float32_t sum2 = r2[0] * tmp[8] + r2[1] * tmp[9] + r2[2] * tmp[10];
-  return sum0 + sum1 + sum2;
-}
-
-inline float32_t conv4x4(float32x4_t r0,
-                         float32x4_t r1,
-                         float32x4_t r2,
-                         float32x4_t r3,
-                         float32x4_t k0,
-                         float32x4_t k1,
-                         float32x4_t k2,
-                         float32x4_t k3) {
-  float32x4_t tmp;
-  tmp = vmulq_f32(r0, k0);
-  tmp = vmlaq_f32(tmp, r1, k1);
-  tmp = vmlaq_f32(tmp, r2, k2);
-  tmp = vmlaq_f32(tmp, r3, k3);
-  return vaddvq_f32(tmp);
-}
-
-/**
- * Each step calculates four elements of the output.
- * First step:
- *   R0[0, 1, 2, 3...] * K[0][0]
- *   R0[1, 2, 3, 4...] * K[0][1]
- *   R0[2, 3, 4, 5...] * K[0][2]
- *   R1[0, 1, 2, 3...] * K[1][0]
- *   R1[1, 2, 3, 4...] * K[1][1]
- *   R1[2, 3, 4, 5...] * K[1][2]
- *   R2[0, 1, 2, 3...] * K[2][0]
- *   R2[1, 2, 3, 4...] * K[2][1]
- * + R2[2, 3, 4, 5...] * K[2][2]
- * ------------------------------
- *     Output[0, 1, 2, 3]
- */
-template <>
-struct DepthwiseConvKernel<3, 1> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 9) {
-      // Load the filters
-      float32x4_t k[3];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 3);
-      k[2] = vld1q_f32(filterData + 6);
-      k[0] = vsetq_lane_f32(0.f, k[0], 3);
-      k[1] = vsetq_lane_f32(0.f, k[1], 3);
-      k[2] = vsetq_lane_f32(0.f, k[2], 3);
-
-      const float* r0 =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      const float* r1 = r0 + inputWidth;
-      const float* r2 = r0 + inputWidth * 2;
-      float32x4_t input[3][3];
-      for (int h = 0; h < outputHeight; h++) {
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t tmp;
-          input[0][0] = vld1q_f32(r0);
-          tmp = vld1q_f32(r0 + 4);
-          input[0][1] = vextq_f32(input[0][0], tmp, 1);
-          input[0][2] = vextq_f32(input[0][0], tmp, 2);
-          input[1][0] = vld1q_f32(r1);
-          tmp = vld1q_f32(r1 + 4);
-          input[1][1] = vextq_f32(input[1][0], tmp, 1);
-          input[1][2] = vextq_f32(input[1][0], tmp, 2);
-          input[2][0] = vld1q_f32(r2);
-          tmp = vld1q_f32(r2 + 4);
-          input[2][1] = vextq_f32(input[2][0], tmp, 1);
-          input[2][2] = vextq_f32(input[2][0], tmp, 2);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 4;
-          r1 += 4;
-          r2 += 4;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          r0++;
-          r1++;
-          r2++;
-          outputData++;
-        }
-
-        r0 += 2;
-        r1 += 2;
-        r2 += 2;
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- * First step:
- *   R0[0, 2, 4, 6...] * K[0][0]
- *   R0[1, 3, 5, 7...] * K[0][1]
- *   R0[2, 4, 6, 8...] * K[0][2]
- *   R1[0, 2, 4, 6...] * K[1][0]
- *   R1[1, 3, 5, 7...] * K[1][1]
- *   R1[2, 4, 6, 8...] * K[1][2]
- *   R2[0, 2, 4, 6...] * K[2][0]
- *   R2[1, 3, 5, 7...] * K[2][1]
- *   R2[2, 4, 6, 8...] * K[2][2]
- * ------------------------------
- *     Output[0, 1, 2, 3]
- */
-template <>
-struct DepthwiseConvKernel<3, 2> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 9) {
-      // Load the filters
-      float32x4_t k[3];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 3);
-      k[2] = vld1q_f32(filterData + 6);
-      k[0] = vsetq_lane_f32(0.f, k[0], 3);
-      k[1] = vsetq_lane_f32(0.f, k[1], 3);
-      k[2] = vsetq_lane_f32(0.f, k[2], 3);
-
-      const float* start =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      float32x4_t input[3][3];
-      for (int h = 0; h < outputHeight; h++) {
-        const float* r0 = start + 2 * h * inputWidth;
-        const float* r1 = start + (2 * h + 1) * inputWidth;
-        const float* r2 = start + (2 * h + 2) * inputWidth;
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t data1;
-          float32x4x2_t data2;
-
-          data2 = vld2q_f32(r0);
-          input[0][0] = data2.val[0];
-          input[0][1] = data2.val[1];
-          data1 = vld1q_f32(r0 + 8);
-          input[0][2] = vextq_f32(data2.val[0], data1, 1);
-
-          data2 = vld2q_f32(r1);
-          input[1][0] = data2.val[0];
-          input[1][1] = data2.val[1];
-          data1 = vld1q_f32(r1 + 8);
-          input[1][2] = vextq_f32(data2.val[0], data1, 1);
-
-          data2 = vld2q_f32(r2);
-          input[2][0] = data2.val[0];
-          input[2][1] = data2.val[1];
-          data1 = vld1q_f32(r2 + 8);
-          input[2][2] = vextq_f32(data2.val[0], data1, 1);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 8;
-          r1 += 8;
-          r2 += 8;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          r0 += 2;
-          r1 += 2;
-          r2 += 2;
-          outputData++;
-        }
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- */
-template <>
-struct DepthwiseConvKernel<4, 1> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 16) {
-      // Load the filters
-      float32x4_t k[4];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 4);
-      k[2] = vld1q_f32(filterData + 8);
-      k[3] = vld1q_f32(filterData + 12);
-
-      const float* r0 =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      const float* r1 = r0 + inputWidth;
-      const float* r2 = r0 + inputWidth * 2;
-      const float* r3 = r0 + inputWidth * 3;
-      float32x4_t input[4][4];
-      for (int h = 0; h < outputHeight; h++) {
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t tmp;
-          input[0][0] = vld1q_f32(r0);
-          tmp = vld1q_f32(r0 + 4);
-          input[0][1] = vextq_f32(input[0][0], tmp, 1);
-          input[0][2] = vextq_f32(input[0][0], tmp, 2);
-          input[0][3] = vextq_f32(input[0][0], tmp, 3);
-
-          input[1][0] = vld1q_f32(r1);
-          tmp = vld1q_f32(r1 + 4);
-          input[1][1] = vextq_f32(input[1][0], tmp, 1);
-          input[1][2] = vextq_f32(input[1][0], tmp, 2);
-          input[1][3] = vextq_f32(input[1][0], tmp, 3);
-
-          input[2][0] = vld1q_f32(r2);
-          tmp = vld1q_f32(r2 + 4);
-          input[2][1] = vextq_f32(input[2][0], tmp, 1);
-          input[2][2] = vextq_f32(input[2][0], tmp, 2);
-          input[2][3] = vextq_f32(input[2][0], tmp, 3);
-
-          input[3][0] = vld1q_f32(r3);
-          tmp = vld1q_f32(r3 + 4);
-          input[3][1] = vextq_f32(input[3][0], tmp, 1);
-          input[3][2] = vextq_f32(input[3][0], tmp, 2);
-          input[3][3] = vextq_f32(input[3][0], tmp, 3);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 4;
-          r1 += 4;
-          r2 += 4;
-          r3 += 4;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          float32x4_t i3 = vld1q_f32(r3);
-          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
-          r0++;
-          r1++;
-          r2++;
-          r3++;
-          outputData++;
-        }
-
-        r0 += 3;
-        r1 += 3;
-        r2 += 3;
-        r3 += 3;
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- */
-template <>
-struct DepthwiseConvKernel<4, 2> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 16) {
-      // Load the filters
-      float32x4_t k[4];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 4);
-      k[2] = vld1q_f32(filterData + 8);
-      k[3] = vld1q_f32(filterData + 12);
-
-      const float* start =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      float32x4_t input[4][4];
-      for (int h = 0; h < outputHeight; h++) {
-        const float* r0 = start + 2 * h * inputWidth;
-        const float* r1 = start + (2 * h + 1) * inputWidth;
-        const float* r2 = start + (2 * h + 2) * inputWidth;
-        const float* r3 = start + (2 * h + 3) * inputWidth;
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4x2_t data1;
-          float32x4x2_t data2;
-
-          data1 = vld2q_f32(r0);
-          data2 = vld2q_f32(r0 + 8);
-          input[0][0] = data1.val[0];
-          input[0][1] = data1.val[1];
-          input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r1);
-          data2 = vld2q_f32(r1 + 8);
-          input[1][0] = data1.val[0];
-          input[1][1] = data1.val[1];
-          input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r2);
-          data2 = vld2q_f32(r2 + 8);
-          input[2][0] = data1.val[0];
-          input[2][1] = data1.val[1];
-          input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r3);
-          data2 = vld2q_f32(r3 + 8);
-          input[3][0] = data1.val[0];
-          input[3][1] = data1.val[1];
-          input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 8;
-          r1 += 8;
-          r2 += 8;
-          r3 += 8;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          float32x4_t i3 = vld1q_f32(r3);
-          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
-          r0 += 2;
-          r1 += 2;
-          r2 += 2;
-          r3 += 2;
-          outputData++;
-        }
-      }
-    }
-  }
-};
-
-template <class T>
-struct Padding {
-  static void run(const T* input,
-                  T* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - inputHeight) / 2;
-    const int paddingWidth = (padInputWidth - inputWidth) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = T(0);
-        }
-
-        memcpy(inputPadding, input, inputWidth * sizeof(T));
-        inputPadding += inputWidth;
-        input += inputWidth;
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = T(0);
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-struct Padding<float> {
-  static void run(const float* input,
-                  float* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - inputHeight) / 2;
-    const int paddingWidth = (padInputWidth - inputWidth) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-
-        int step = inputWidth >> 2;
-        int remain = inputWidth & 3;
-        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(input);
-          vst1q_f32(inputPadding, s0);
-          input += 4;
-          inputPadding += 4;
-        }
-        for (int r = 0; r < remain; r++) {
-          *inputPadding++ = *input++;
-        }
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-// for stride is 2
-struct StridePadding {
-  static void run(const float* input,
-                  float* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2;
-    const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-
-        int step = inputWidth >> 2;
-        int remain = inputWidth & 3;
-        float32x4_t s1 = vdupq_n_f32(0.f);
-        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(input);
-          float32x4x2_t v = {{s0, s1}};
-          vst2q_f32(inputPadding, v);
-          input += 4;
-          inputPadding += 8;
-        }
-        for (int r = 0; r < remain; r++) {
-          *inputPadding++ = *input++;
-          *inputPadding++ = float(0);
-        }
-        inputPadding--;
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-        if (i != inputHeight - 1) {
-          memset(inputPadding, 0, padInputWidth * sizeof(float));
-          inputPadding += padInputWidth;
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-#endif
-
-#endif
-
-}  // namespace neon
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
deleted file mode 100644
index feb77e1ff9f..00000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(paddingH(), paddingW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input, input -> inputPadding
-    float* inputPadding = inputData;
-    int padInputHeight =
-        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
-    int padInputWidth =
-        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
-
-    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      if (strideH() == 1) {
-        neon::Padding<float>::run(inputData,
-                                  inputPadding,
-                                  batchSize * inputChannels,
-                                  inputHeight,
-                                  inputWidth,
-                                  padInputHeight,
-                                  padInputWidth);
-      } else if (strideH() == 2) {
-        neon::StridePadding::run(inputData,
-                                 inputPadding,
-                                 batchSize * inputChannels,
-                                 inputHeight,
-                                 inputWidth,
-                                 padInputHeight,
-                                 padInputWidth);
-      } else {
-        LOG(FATAL) << "Not supported";
-      }
-    }
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 4) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
-                    CPU,
-                    NeonDepthwiseConvTransposeFunction);
-
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/neon_util.h b/paddle/legacy/function/neon/neon_util.h
deleted file mode 100644
index 95076b1387a..00000000000
--- a/paddle/legacy/function/neon/neon_util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#include <arm_neon.h>
-
-namespace paddle {
-
-namespace neon {
-
-inline float32x4_t vld1q_f32_aligned(const float* p) {
-  return vld1q_f32(
-      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
-}
-
-#ifndef __aarch64__
-inline float32_t vaddvq_f32(float32x4_t a) {
-  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
-  return vget_lane_f32(vpadd_f32(v, v), 0);
-}
-
-#define vmlaq_laneq_f32(a, b, v, lane) \
-  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
-#endif
-
-}  // namespace neon
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
deleted file mode 100644
index 81c832e7747..00000000000
--- a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "nnpack.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-DEFINE_bool(nnpack_allocate_outside,
-            true,
-            "Allocate and free workspace memory outside the NNPACK interface.");
-DEFINE_int32(nnpack_num_threads,
-             0,
-             "The number of nnpack threads"
-             "default: 0; 0 to disable threadpool.");
-
-namespace paddle {
-
-nnp_convolution_algorithm get_nnp_convolution_algorithm(
-    const std::string& algorithm) {
-  if (algorithm == "auto") {
-    return nnp_convolution_algorithm_auto;
-  } else if (algorithm == "ft8x8") {
-    return nnp_convolution_algorithm_ft8x8;
-  } else if (algorithm == "ft16x16") {
-    return nnp_convolution_algorithm_ft16x16;
-  } else if (algorithm == "wt8x8") {
-    return nnp_convolution_algorithm_wt8x8;
-  } else if (algorithm == "implicit-gemm") {
-    return nnp_convolution_algorithm_implicit_gemm;
-  } else if (algorithm == "direct") {
-    return nnp_convolution_algorithm_direct;
-  } else {
-    return nnp_convolution_algorithm_auto;
-  }
-}
-
-template <DeviceType Device>
-class NNPACKConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    transform_strategy_ = nnp_convolution_transform_strategy_compute;
-    nnp_status status = nnp_initialize();
-    CHECK_EQ(status, nnp_status_success);
-    workspaceBuffer_ = nullptr;
-    workspaceSize_ = 0;
-
-    create_nnpack_threadpool();
-  }
-
-  ~NNPACKConvFunction() {
-    if (workspaceBuffer_) {
-      free(workspaceBuffer_);
-    }
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
-    nnp_padding padding = {.top = (size_t)paddingH(),
-                           .right = (size_t)paddingW(),
-                           .bottom = (size_t)paddingH(),
-                           .left = (size_t)paddingW()};
-    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
-    nnp_size outputSubsampling = {.width = (size_t)strideW(),
-                                  .height = (size_t)strideH()};
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    void* bufferPtr = nullptr;
-    size_t* sizePtr = nullptr;
-    size_t needSize;
-    if (FLAGS_nnpack_allocate_outside) {
-      if (batchSize == 1) {
-        nnp_status status = nnp_convolution_inference(algorithm_,
-                                                      transform_strategy_,
-                                                      inputChannels,
-                                                      outputChannels,
-                                                      inputSize,
-                                                      padding,
-                                                      kernelSize,
-                                                      outputSubsampling,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      &needSize,
-                                                      nnp_activation_identity,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      } else {
-        // only supports stride = 1
-        CHECK_EQ(strideH(), 1);
-        CHECK_EQ(strideW(), 1);
-        nnp_status status = nnp_convolution_output(algorithm_,
-                                                   batchSize,
-                                                   inputChannels,
-                                                   outputChannels,
-                                                   inputSize,
-                                                   padding,
-                                                   kernelSize,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   &needSize,
-                                                   nnp_activation_identity,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-
-      VLOG(3) << "workspace size is " << needSize;
-      if (needSize > workspaceSize_) {
-        workspaceSize_ = needSize;
-        if (workspaceBuffer_) {
-          free(workspaceBuffer_);
-        } else {
-          posix_memalign(&workspaceBuffer_, 64, needSize);
-        }
-      }
-
-      if (needSize) {
-        bufferPtr = workspaceBuffer_;
-        sizePtr = &needSize;
-      }
-    }
-
-    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
-    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    if (batchSize == 1) {
-      for (size_t g = 0; g < groups_; g++) {
-        nnp_status status =
-            nnp_convolution_inference(algorithm_,
-                                      transform_strategy_,
-                                      inputChannels / groups_,
-                                      outputChannels / groups_,
-                                      inputSize,
-                                      padding,
-                                      kernelSize,
-                                      outputSubsampling,
-                                      inputData + inputOffset * g,
-                                      filterData + filterOffset * g,
-                                      nullptr, /* bias */
-                                      outputData + outputOffset * g,
-                                      bufferPtr,
-                                      sizePtr,
-                                      nnp_activation_identity,
-                                      nullptr,
-                                      threadpool_, /* threadpool */
-                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-    } else {
-      // only supports stride = 1
-      CHECK_EQ(strideH(), 1);
-      CHECK_EQ(strideW(), 1);
-
-      // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1.
-      CHECK_EQ(groups_, static_cast<size_t>(1));
-      nnp_status status = nnp_convolution_output(algorithm_,
-                                                 batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
-                                                 inputSize,
-                                                 padding,
-                                                 kernelSize,
-                                                 inputData,
-                                                 filterData,
-                                                 nullptr, /* bias */
-                                                 outputData,
-                                                 bufferPtr,
-                                                 sizePtr,
-                                                 nnp_activation_identity,
-                                                 nullptr,
-                                                 threadpool_, /* threadpool */
-                                                 nullptr);
-      CHECK_EQ(status, nnp_status_success);
-    }
-  }
-
-  static void create_nnpack_threadpool() {
-    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
-  }
-
- private:
-  nnp_convolution_algorithm algorithm_;
-  nnp_convolution_transform_strategy transform_strategy_;
-  void* workspaceBuffer_;
-  size_t workspaceSize_;
-  static pthreadpool_t threadpool_;
-};
-
-template <DeviceType Device>
-pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
-
-REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
deleted file mode 100644
index a2db83f5a36..00000000000
--- a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/function/ConvOpTest.h"
-
-namespace paddle {
-
-TEST(NNPACK, Forward) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-TEST(NNPACK, Depthwise) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/CMakeLists.txt b/paddle/legacy/gserver/CMakeLists.txt
deleted file mode 100644
index 6dc877dd90e..00000000000
--- a/paddle/legacy/gserver/CMakeLists.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-# Gserver package contains:
-#   * Layers
-#   * Activations
-#   * DataProviders
-#   * Evaluators
-#   * GradientMachines(NeuralNetwork)
-file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
-file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
-set(GSERVER_SOURCES
-    layers/LstmCompute.cu
-    layers/GruCompute.cu
-    ${GSERVER_SOURCES})
-
-macro(filter_test VAR_NAME)
-    set(tmp)
-    foreach(p IN LISTS ${VAR_NAME})
-        if(NOT ${p} MATCHES ".*tests/.*")
-             set(tmp ${p} ${tmp})
-        endif()
-    endforeach()
-    set(${VAR_NAME} ${tmp})
-endmacro()
-
-filter_test(GSERVER_HEADER)
-filter_test(GSERVER_SOURCES)
-
-if(NOT WITH_MKLDNN)
-    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
-    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
-    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
-else()
-    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
-endif()
-
-if(NOT WITH_MKLML)
-    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
-    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
-    message(STATUS "Skip compiling with MKLPackedLayers")
-else()
-    message(STATUS "Compile with MKLPackedLayers")
-endif()
-
-if(NOT WITH_GPU)
-    list(REMOVE_ITEM GSERVER_HEADER
-        layers/CudnnConvBaseLayer.h
-        layers/CudnnConvLayer.h
-        layers/CudnnConvTransLayer.h
-        layers/CudnnPoolLayer.h
-        layers/CudnnBatchNormLayer.h)
-
-    list(REMOVE_ITEM GSERVER_SOURCES
-        layers/CudnnConvBaseLayer.cpp
-        layers/CudnnConvLayer.cpp
-        layers/CudnnConvTransLayer.cpp
-        layers/CudnnPoolLayer.cpp
-        layers/CudnnBatchNormLayer.cpp)
-    compile_cu_as_cpp(layers/LstmCompute.cu)
-    compile_cu_as_cpp(layers/GruCompute.cu)
-endif()
-
-if(NOT WITH_PYTHON)
-    list(REMOVE_ITEM GSERVER_SOURCES
-            dataproviders/PyDataProvider.cpp)
-    
-    list(REMOVE_ITEM GSERVER_HEADER
-            dataproviders/PyDataProvider.h)
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove evaluators
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/ValidationLayer.cpp
-         evaluators/Evaluator.cpp
-         evaluators/DetectionMAPEvaluator.cpp
-         evaluators/CTCErrorEvaluator.cpp
-         evaluators/ChunkEvaluator.cpp)
-
-    # Remove dataproviders
-    list(REMOVE_ITEM GSERVER_SOURCES
-         dataproviders/DataProvider.cpp
-         dataproviders/MultiDataProvider.cpp
-         dataproviders/PyDataProvider2.cpp
-         dataproviders/PyDataProvider.cpp)
-
-    # Remove useless gradientmachines
-    list(REMOVE_ITEM GSERVER_SOURCES
-         gradientmachines/MultiNetwork.cpp
-         gradientmachines/RecurrentGradientMachine.cpp
-         gradientmachines/ParallelNeuralNetwork.cpp
-         gradientmachines/GradientMachineMode.cpp
-         gradientmachines/MultiGradientMachine.cpp)
-
-    # Remove layers that used in training
-    list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp
-         layers/CostLayer.cpp
-         layers/MultiBoxLossLayer.cpp
-         layers/WarpCTCLayer.cpp
-         layers/CTCLayer.cpp
-         layers/LinearChainCTC.cpp
-         layers/PrintLayer.cpp)
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/OuterProdLayer.cpp
-         layers/SumToOneNormLayer.cpp
-         layers/ConvShiftLayer.cpp
-         layers/InterpolationLayer.cpp
-         layers/AgentLayer.cpp
-         layers/DotMulOperator.cpp
-         layers/GruStepLayer.cpp
-         layers/LstmStepLayer.cpp
-         layers/ConvexCombinationLayer.cpp
-         layers/Conv3DLayer.cpp
-         layers/DeConv3DLayer.cpp
-         layers/CropLayer.cpp
-         layers/CrossEntropyOverBeam.cpp
-         layers/DataNormLayer.cpp
-         layers/FeatureMapExpandLayer.cpp
-         layers/HierarchicalSigmoidLayer.cpp
-         layers/MultinomialSampler.cpp
-         layers/NCELayer.cpp
-         layers/KmaxSeqScoreLayer.cpp
-         layers/MDLstmLayer.cpp
-         layers/MultiplexLayer.cpp
-         layers/PadLayer.cpp
-         layers/Pool3DLayer.cpp
-         layers/ResizeLayer.cpp
-         layers/RotateLayer.cpp
-         layers/RowConvLayer.cpp
-         layers/RowL2NormLayer.cpp
-         layers/SamplingIdLayer.cpp
-         layers/ScaleShiftLayer.cpp
-         layers/SelectiveFullyConnectedLayer.cpp
-         layers/SpatialPyramidPoolLayer.cpp
-         layers/BilinearInterpLayer.cpp
-         layers/ClipLayer.cpp)
-endif()
-
-if(WITH_GPU)
-    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
-else()
-    add_library(paddle_gserver STATIC
-        ${GSERVER_SOURCES})
-endif()
-
-add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.cpp b/paddle/legacy/gserver/activations/ActivationFunction.cpp
deleted file mode 100644
index ae07c7e6d7f..00000000000
--- a/paddle/legacy/gserver/activations/ActivationFunction.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ActivationFunction.h"
-
-#include <algorithm>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <thread>
-#include <type_traits>
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "MKLDNNActivation.h"
-#endif
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gActivationRegistrar;
-/**
- * @def ACTIVATION_CLASS_NAME
- * @brief Macro for getting derived activation class name
- * @note ACTIVATION_CLASS_NAME(softmax) softmax_;
- * means softmaxActivation softmax_;
- */
-#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation
-/**
- * @def BEGIN_DEFINE_ACTIVATION
- * @brief Macro for defining a devried activation class
- */
-#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
-  class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
-   private:                                                                  \
-    static const std::string name;                                           \
-                                                                             \
-   public:                                                                   \
-    const std::string& getName() const { return name; }
-/**
- * @def END_DEFINE_ACTIVATION
- * @brief Macro for registering a derived activation class
- */
-#define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
-  }                                                                \
-  ;                                                                \
-  const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
-      #ACTIVATION_NAME;                                            \
-  static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
-    gActivationRegistrar                                           \
-        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(    \
-            #ACTIVATION_NAME);                                     \
-  });
-
-/**
- * @brief The IdentityActivation class
- *
- * Do nothing when forward/backward.
- */
-class IdentityActivation : public ActivationFunction {
- public:
-  static const std::string name;
-  Error __must_check forward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  Error __must_check backward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  const std::string& getName() const { return name; }
-};
-const std::string IdentityActivation::name = "";
-static InitFunction __reg_activation__identity([] {
-  gActivationRegistrar.registerClass<IdentityActivation>("");
-  gActivationRegistrar.registerClass<IdentityActivation>("linear");
-});
-
-/**
- * @brief Sigmoid Activation
- * \f[
- * f(z) = \frac{1}{1+exp(-z)}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sigmoid)
-Error __must_check forward(Argument& act) {
-  act.value->sigmoid(*act.value);
-  return Error();
-}
-Error __must_check backward(Argument& act) {
-  act.grad->sigmoidDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sigmoid)
-
-/**
- * @brief Softmax Activation
- * \f[
- * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softmax)
-private:
-MatrixPtr sftMaxSum_;
-MatrixPtr sftMaxDot_;
-
-public:
-Error __must_check forward(Argument& act) {
-  act.value->softmax(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-
-  if (outputG->useGpu()) {
-    outputG->softmaxBackward(*outputV);
-  } else {
-    SetDevice device(act.deviceId);
-    Matrix::resizeOrCreate(sftMaxDot_,
-                           outputG->getHeight(),
-                           outputG->getWidth(),
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-    Matrix::resizeOrCreate(sftMaxSum_,
-                           outputG->getHeight(),
-                           1,
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-
-    sftMaxDot_->dotMul(*outputG, *outputV);
-    sftMaxSum_->colMerge(*sftMaxDot_);
-
-    act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(softmax)
-
-/**
- * @brief Sequence_softmax Activation
- * @note Softmax on all frames of one sequence.
- * Width of frame must be one.
- */
-BEGIN_DEFINE_ACTIVATION(sequence_softmax)
-private:
-ACTIVATION_CLASS_NAME(softmax) softmax_;
-Argument argument_;
-
-public:
-Error __must_check forward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  if (!argument_.value) {
-    argument_.value = Matrix::create(nullptr,
-                                     /* height= */ 1,
-                                     1,
-                                     /* trans= */ false,
-                                     useGpu(act.deviceId));
-    argument_.grad = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    useGpu(act.deviceId));
-  }
-
-  auto starts =
-      act.hasSubseq()
-          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
-          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
-  act.value->sequenceSoftmax(*act.value, *starts);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  size_t numSequences =
-      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
-  const int* starts = act.getCpuStartPositions();
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    // TODO(Dangqingqing) optimization for GPU
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    argument_.value->setData(act.value->getData() + offset, 1UL, size);
-    argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
-
-    Error err = softmax_.backward(argument_);
-    if (!err.isOK()) return err;
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(sequence_softmax)
-
-/*
- * @brief SoftSign Activation.
- * \f[
- * f(z) = \frac{z}{1 + |z|}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softsign)
-private:
-MatrixPtr denominator_;
-
-Error __must_check forward(Argument& act) {
-  size_t height = act.value->getHeight();
-  size_t width = act.value->getWidth();
-  Matrix::resizeOrCreate(
-      denominator_, height, width, false, useGpu(act.deviceId));
-  denominator_->assign(*act.value);
-  denominator_->abs2();
-  denominator_->add(1.);
-
-  act.value->dotDiv(*act.value, *denominator_);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  denominator_->square2();
-  denominator_->scalarDiv(*denominator_, 1.);
-  act.grad->dotMul(*act.grad, *denominator_);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softsign)
-
-/**
- * @brief Relu Activation.
- * forward. y = max(0, z)
- *
- * derivative of relu is:
- *
- *    1 if z > 0
- *
- *    0 otherwise.
- */
-BEGIN_DEFINE_ACTIVATION(relu)
-Error __must_check forward(Argument& act) {
-  act.value->relu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->reluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(relu)
-
-/**
- * @brief BRelu Activation.
- *
- * forward. y = min(24, max(0, z))
- *
- * derivative of brelu is:
- *
- *    1 if 0 < z < 24
- *
- *    0 otherwise.
- *
- * TODO(yuyang18): Remove magic number 24 or make it configuable.
- */
-BEGIN_DEFINE_ACTIVATION(brelu)
-Error __must_check forward(Argument& act) {
-  act.value->brelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->breluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(brelu)
-
-/**
- * @brief Tanh Activation.
- * \f[
- * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(tanh)
-Error __must_check forward(Argument& act) {
-  act.value->tanh(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->tanhDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(tanh)
-
-/**
- * @brief Scaled Tanh Activation
- * \f[
- * f(z) = 1.7159 * tanh(2/3*z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(stanh)
-private:
-real a, b;
-
-public:
-ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-Error __must_check forward(Argument& act) {
-  act.value->scaledTanh(*act.value, a, b);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->scaledTanhDerivative(*act.value, a, b);
-  return Error();
-}
-END_DEFINE_ACTIVATION(stanh)
-
-/**
- * @brief Soft Relu Activation.
- * \f[
- * f(z) = ln(1+e^z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softrelu)
-Error __must_check forward(Argument& act) {
-  act.value->softrelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->softreluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softrelu)
-
-/**
- * @brief Abs Activation.
- * Forward: f(z) = abs(z)
- *
- * Derivative:
- *
- *     1   if z>0
- *
- *    -1   if z<0
- *
- *     0   if z=0
- */
-BEGIN_DEFINE_ACTIVATION(abs)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->abs2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->absDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(abs)
-
-/**
- * @brief Square Activation.
- * \f[
- * f(z) = z^2.
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(square)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->square2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->squareDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(square)
-
-/**
- * @brief Exponential Activation.
- * \f[
- * f(z) = e^z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(exponential)
-Error __must_check forward(Argument& act) {
-  act.value->exp2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->expDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(exponential)
-
-/**
- * @brief Reciprocal Activation.
- * \f[
- * f(z) = 1/z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(reciprocal)
-Error __must_check forward(Argument& act) {
-  act.value->reciprocal2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotMulSquare(*act.value);
-  act.grad->neg();
-  return Error();
-}
-END_DEFINE_ACTIVATION(reciprocal)
-
-/**
- * @brief Square Root Activation.
- * \f[
- * f(z) = sqrt(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sqrt)
-Error __must_check forward(Argument& act) {
-  act.value->sqrt2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.value);
-  act.grad->mulScalar(0.5);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sqrt)
-
-/**
- * @brief Logarithm Activation.
- * \f[
- * f(z) = log(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(log)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->log2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(log)
-
-ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_WITH_MKLDNN
-  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
-    return MKLDNNActivation::create(type);
-  }
-#endif
-
-  return gActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.h b/paddle/legacy/gserver/activations/ActivationFunction.h
deleted file mode 100644
index 8bc5b0f529a..00000000000
--- a/paddle/legacy/gserver/activations/ActivationFunction.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-struct Argument;
-/**
- * @brief Activation function is a function that transforms a set of input
- * signals into an output signals. The purpose of the activation function
- * is to introduce non-liearilty into the network.
- *
- * @note Common activation function are provieded, including linear,
- * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh,
- * softrelu, abs, square, exponential.
- *
- */
-class ActivationFunction {
- public:
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-
-  ActivationFunction() {}
-
-  virtual ~ActivationFunction() {}
-
-  /**
-   * @brief Foward propagation
-   *
-   * act.value <- f(act.value),
-   * where f is the activation function.
-   * Suppose that before calling forward(), act.value is x and
-   * after forward() is called, act.value is y, then y = f(x).
-   *
-   * Usually, act is Layer::output_
-   */
-  virtual Error __must_check forward(Argument& act) = 0;
-
-  /**
-   * @brief Backward propagaion
-   *
-   * x and y are defined in the above comment for forward().
-   * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
-   * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
-   */
-  virtual Error __must_check backward(Argument& act) = 0;
-
-  virtual const std::string& getName() const = 0;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
deleted file mode 100644
index 2eed7af70a8..00000000000
--- a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNActivation.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
-/**
- * @def MKLDNN_ACTIVATION_CLASS_NAME
- * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
- * means mkldnn_reluActivation relu_;
- */
-#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
-
-/**
- * @def BEGIN_MKLDNN_ACTIVATION
- */
-#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
-/**
- * @def END_MKLDNN_ACTIVATION
- */
-#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
- private:                                                          \
-  static const std::string name;                                   \
-                                                                   \
- public:                                                           \
-  const std::string& getName() const { return name; }              \
-  }                                                                \
-  ;                                                                \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
-      "mkldnn_" #ACT_TYPE;                                         \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
-    gMKLDNNActivationRegistrar                                     \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
-            "mkldnn_" #ACT_TYPE);                                  \
-  });
-
-/**
- * @def DEFINE_MKLDNN_ACTIVATION
- */
-#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)
-
-/**
- * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
- */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
-    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
- private:                                                            \
-  static const float alpha;                                          \
-  static const float bwdAlpha;                                       \
-                                                                     \
- public:                                                             \
-  float getAlpha() const { return alpha; }                           \
-  float getBwdAlpha() const { return bwdAlpha; }                     \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
-
-/**
- * @brief MKLDNN Relu Activation.
- * Actually mkldnn_relu is Leaky Relu.
- *  f(x) = x                   (x >= 0)
- *  f(x) = negative_slope * x  (x <  0)
- * @note the negative_slope should be -0.f in forward
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
-
-/**
- * @brief MKLDNN Tanh Activation.
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-/**
- * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
- *  f(x) = x                              (x >= 0)
- *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
-  const std::map<std::string, mkldnn::algorithm> algoMap = {
-      {"relu", algorithm::eltwise_relu},
-      {"tanh", algorithm::eltwise_tanh},
-      {"elu", algorithm::eltwise_elu}};
-  type.erase(0, 7);  // remove mkldnn_
-  algorithm algo = (algorithm)0;
-  mapGet(type, algoMap, &algo);
-  return algo;
-}
-
-void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  // note: alpha represents the NegativeSlope when used in relu.
-  float alpha = getAlpha();
-  float beta = getBeta();
-  algorithm algo = getAlgo(this->getName());
-  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                   algo,
-                                   val_->getMemoryDesc(),
-                                   alpha,
-                                   beta);
-  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
-  // use inplace for forward but save input value before submit
-  inVal_ = val_;
-  copyInVal_ = nullptr;
-  if (act.grad && algo == algorithm::eltwise_tanh) {
-    // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
-    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-    CHECK(copyInVal_) << "should not be emptry";
-    pipelineFwd_.push_back(*copyInVal_);
-  }
-  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-  needResetBwd_ = true;
-}
-
-void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
-  if (!needResetBwd_) {
-    return;
-  }
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-  needResetBwd_ = false;
-  algorithm algo = getAlgo(this->getName());
-  float alpha = getBwdAlpha();
-  float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
-  auto eng = CPUEngine::Instance().getEngine();
-  auto bwdDesc = eltwise_bwd::desc(
-      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-  CHECK(inVal_);
-  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-  pipelineBwd_.clear();
-  pipelineBwd_.push_back(*bwd_);
-}
-
-/**
- * @brief MKLDNN Softmax Activation
- */
-DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
-
-void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  int axis = 1;
-  auto fwdDesc = softmax_fwd::desc(
-      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
-  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
-  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-}
-
-Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  real* v = act.value->getData();
-  real threshold = exp(-64);
-#pragma omp parallel for
-  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
-    v[i] = v[i] < threshold ? threshold : v[i];
-  }
-  return Error();
-}
-
-Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-  Matrix::resizeOrCreate(sftMaxDot_,
-                         outputG->getHeight(),
-                         outputG->getWidth(),
-                         /* trans */ false,
-                         /* useGpu */ false);
-  Matrix::resizeOrCreate(sftMaxSum_,
-                         outputG->getHeight(),
-                         1,
-                         /* trans */ false,
-                         /* useGpu */ false);
-  sftMaxDot_->dotMul(*outputG, *outputV);
-  sftMaxSum_->colMerge(*sftMaxDot_);
-  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  return Error();
-}
-
-ActivationFunction* MKLDNNActivation::create(const std::string& type) {
-  return gMKLDNNActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gMKLDNNActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-void MKLDNNActivation::resetFwd(Argument& act) {
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-  cnt_ = act.value->getElementCnt();
-  pipelineFwd_.clear();
-  stream_.reset(new MKLDNNStream());
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
-  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-  if (val_ == nullptr) {
-    int bs = act.getBatchSize();
-    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-    int ic = cnt_ / bs / ih / iw;
-    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-    val_ = MKLDNNMatrix::create(
-        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
-    CHECK(val_);
-    val_->downSpatial();
-  }
-}
-
-Error __must_check MKLDNNActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  return Error();
-}
-Error __must_check MKLDNNActivation::backward(Argument& act) {
-  resetBwd(act);
-  stream_->submit(pipelineBwd_);
-  return Error();
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.h b/paddle/legacy/gserver/activations/MKLDNNActivation.h
deleted file mode 100644
index 59c447ad073..00000000000
--- a/paddle/legacy/gserver/activations/MKLDNNActivation.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ActivationFunction.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/math/MKLDNNMatrix.h"
-#include "paddle/legacy/parameter/Argument.h"
-
-namespace paddle {
-
-/**
- * @brief Base class of MKLDNN Activation.
- * Common activation function are provieded,
- * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
- */
-class MKLDNNActivation : public ActivationFunction {
- protected:
-  // input value element count
-  size_t cnt_;
-  // should not merge the resetBwd into resetFwd,
-  // because the grad data would be changing before backward.
-  bool needResetBwd_;
-  // mkldnn matrix, primitive, stream and pipeline
-  MKLDNNMatrixPtr val_;
-  MKLDNNMatrixPtr grad_;
-  std::shared_ptr<mkldnn::engine> engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwd_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
- public:
-  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
-  ~MKLDNNActivation() {}
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-  virtual const std::string& getName() const = 0;
-  /**
-   * reset the forward primitives
-   */
-  virtual void resetFwd(Argument& act);
-  /**
-   * reset the backward primitives,
-   * can not merge this functions into resetFwd as the grad data
-   * would be changing before backward.
-   */
-  virtual void resetBwd(Argument& act) {}
-  virtual Error __must_check forward(Argument& act);
-  virtual Error __must_check backward(Argument& act);
-};
-
-/**
- * @brief Base class of MKLDNN Eltwise Activation,
- * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
- */
-class MKLDNNEltwiseActivation : public MKLDNNActivation {
-  typedef mkldnn::eltwise_forward eltwise_fwd;
-  typedef mkldnn::eltwise_backward eltwise_bwd;
-  typedef mkldnn::algorithm algorithm;
-
- protected:
-  // save the forward primitive desc, which can be used backward
-  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
-  // eltwise_bwd need src input value
-  MKLDNNMatrixPtr inVal_;
-  // use for copy data
-  std::shared_ptr<mkldnn::reorder> copyInVal_;
-
- public:
-  MKLDNNEltwiseActivation() {}
-  ~MKLDNNEltwiseActivation() {}
-  virtual const std::string& getName() const = 0;
-
-  // in common, the alpha of forward and backward should be equal.
-  // but for relu, to avoid negative value, they should be opposite
-  virtual float getAlpha() const = 0;
-  virtual float getBwdAlpha() const = 0;
-  virtual float getBeta() const { return 0.f; }
-  virtual algorithm getAlgo(std::string type) const;
-  void resetFwd(Argument& act) override;
-  void resetBwd(Argument& act) override;
-};
-
-/**
- * @brief Base class of MKLDNN softmax Activation,
- * only have mkldnn forward, use cpu implement for backward.
- */
-class MKLDNNSoftmaxActivation : public MKLDNNActivation {
-  typedef mkldnn::softmax_forward softmax_fwd;
-
- private:
-  // for backward
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sftMaxDot_;
-
- public:
-  MKLDNNSoftmaxActivation() {}
-  ~MKLDNNSoftmaxActivation() {}
-  virtual const std::string& getName() const = 0;
-  void resetFwd(Argument& act) override;
-  Error __must_check forward(Argument& act) override;
-  Error __must_check backward(Argument& act) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.cpp b/paddle/legacy/gserver/dataproviders/DataProvider.cpp
deleted file mode 100644
index b67af8a326b..00000000000
--- a/paddle/legacy/gserver/dataproviders/DataProvider.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataProvider.h"
-
-#include <unistd.h>
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void BufferBatch::swap(BufferBatch* bufBatch) {
-  DataBatch* batchData = bufBatch->getDataBatch();
-  hl_event_t hlEvent = bufBatch->getCuEvent();
-  hl_stream_t hlStream = bufBatch->getCuStream();
-  bufBatch->setDataBatch(batchData_);
-  bufBatch->setCuStream(hlStream_);
-  bufBatch->setCuEvent(hlEvent_);
-
-  batchData_ = batchData;
-  hlEvent_ = hlEvent;
-  hlStream_ = hlStream;
-}
-
-void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
-  if (batchData_ == NULL) {
-    batchData_ = new DataBatch();
-  }
-  std::vector<Argument>& destData = batchData_->getStreams();
-  int numStreams = srcBatch->getNumStreams();
-  destData.resize(numStreams);
-  batchData_->setSize(srcBatch->getSize());
-  if (useGpu) {
-    createCuEvent();
-  }
-
-  for (int i = 0; i < numStreams; i++) {
-    destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_);
-  }
-  if (useGpu) {
-    hl_stream_record_event(hlStream_, hlEvent_);
-  }
-}
-
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
-                           bool useGpu,
-                           int64_t batchSize) {
-  batchSize_ = batchSize;
-  dataPool_ = dataPool;
-  useGpu_ = useGpu;
-  dataQueue_ = new BufferBatchQueue();
-  bufferQueue_ = new BufferBatchQueue();
-
-  // insert a empty buffer
-  bufferQueue_->enqueue(new BufferBatch());
-  stopping_ = false;
-  pending_ = true;
-}
-
-DoubleBuffer::~DoubleBuffer() {
-  finishAsyncLoad();
-  while (dataQueue_->size()) {
-    BufferBatch* dataBtch = dataQueue_->dequeue();
-    delete dataBtch;
-    dataBtch = NULL;
-  }
-  while (bufferQueue_->size()) {
-    BufferBatch* bufBtch = bufferQueue_->dequeue();
-    delete bufBtch;
-    bufBtch = NULL;
-  }
-  delete dataQueue_;
-  dataQueue_ = NULL;
-  delete bufferQueue_;
-  bufferQueue_ = NULL;
-}
-
-void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
-  // get data
-  BufferBatch* batch = dataQueue_->dequeue();
-  batch->syncEvent();  // when use GPU, need synchronized with the cuEvent
-  *dataBatch = *(batch->getDataBatch());
-
-  // push anothor buffer
-  if (*usingBatch_ == nullptr) {
-    *usingBatch_ = std::make_shared<BufferBatch>();
-  }
-
-  // Mark the using-batch
-  batch->swap((*usingBatch_).get());
-  bufferQueue_->enqueue(batch);
-
-  if (0 == dataBatch->getSize()) {
-    setPending(true);
-  }
-}
-
-void DoubleBuffer::insertOneBatch(DataBatch* batch) {
-  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
-    if (stopping_) return;
-  }
-  BufferBatch* bufBatch = bufferQueue_->dequeue();
-  // clone and copy the data from an Threadlocal Variable
-  bufBatch->clone(batch, useGpu_);
-  dataQueue_->enqueue(bufBatch);
-}
-
-void DoubleBuffer::asyncLoadBatch() {
-  int64_t actualSize = 0;
-  if (useGpu_) {
-    hl_set_device(FLAGS_gpu_id);
-  }
-  setPending(false);
-
-  while (true) {
-    taskReadySem_.wait();
-    if (stopping_) break;
-
-    while (batchSize_ == 0 && !stopping_) {
-      usleep(5);
-    }
-    if (stopping_) break;
-
-    do {
-      DataBatch newBatch;
-      {
-        REGISTER_TIMER("getNextBatchInternal");
-        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
-      }
-      insertOneBatch(&newBatch);
-    } while (actualSize > 0 && !stopping_);
-  }
-}
-
-void DoubleBuffer::startAsyncLoad() {
-  if (asyncLoader_ == nullptr) {
-    asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); }));
-  }
-  taskReadySem_.post();
-}
-
-ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-    DataProvider::registrar_;
-
-DataProvider* DataProvider::create(const DataConfig& config,
-                                   const ModelConfig& modelConfig,
-                                   bool useGpu) {
-  return registrar_.createByType(config.type(), config, modelConfig, useGpu);
-}
-
-REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
-REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-
-int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
-  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
-                                    : getNextBatchInternal(size, batch);
-
-  if (!batchSize) return 0;
-
-  if (!config_.constant_slots_size()) return batchSize;
-
-  auto& constantSlots = *constantSlots_;
-  constantSlots.resize(config_.constant_slots_size());
-
-  for (int i = 0; i < config_.constant_slots_size(); ++i) {
-    MemoryHandlePtr handle =
-        constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
-    Matrix::resizeOrCreate(constantSlots[i],
-                           batchSize,
-                           1,         // = width
-                           false,     // = trans
-                           useGpu_);  // = useGpu
-    if (handle != constantSlots[i]->getMemoryHandle()) {
-      // memory buf was reallocated. We need to initialize the value
-      constantSlots[i]->assign(config_.constant_slots(i));
-    }
-    batch->appendData(constantSlots[i],
-                      batch->getStream(0).sequenceStartPositions);
-  }
-
-  return batchSize;
-}
-
-int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) {
-  CHECK(doubleBuffer_ != nullptr);
-
-  if (doubleBuffer_->getBatchSize() != size) {
-    doubleBuffer_->setBatchSize(size);
-  }
-
-  doubleBuffer_->removeOneBatch(batch);
-  return batch->getSize();
-}
-
-void DataProvider::initAsyncLoader() {
-  if (doubleBuffer_ == nullptr) {
-    doubleBuffer_.reset(new DoubleBuffer(this, useGpu_));
-  }
-  useGpu_ = false;  // Avoid D2D copy, it will delay the computing performance
-}
-
-SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
-                                               bool useGpu,
-                                               bool withInfo)
-    : DataProvider(config, useGpu) {
-  /* initialize the size of a sample, and the buffer */
-  sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
-  bufferCapacity_ = config_.buffer_capacity();
-  withInfo_ = withInfo;
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-
-  /* malloc buffer in cpu */
-  hInputDataBuf_ = std::make_shared<CpuMatrix>(bufferCapacity_, sampleDim_);
-  hInputLabelBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-  hInputInfoBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-}
-
-void SimpleDataProviderBase::shuffle() {
-  int i, t;
-  int len = sampleNumInBuf_;
-  std::vector<real> temp(sampleDim_);
-  real* data = hInputDataBuf_->getData();
-  int* label = hInputLabelBuf_->getData();
-  int* info = hInputInfoBuf_->getData();
-  int sampleSz = sizeof(real) * sampleDim_;
-  for (i = 0; i < len; i++) {
-    int randNum = rand();  // NOLINT TODO(yuyang18): Use rand_r instead?
-    t = randNum % (len - i) + i;
-    // swap
-    if (i != t) {
-      // swap data
-      memcpy(&temp[0], &data[i * sampleDim_], sampleSz);
-      memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz);
-      memcpy(&data[t * sampleDim_], &temp[0], sampleSz);
-      std::swap(label[i], label[t]);
-      if (withInfo_) {
-        std::swap(info[i], info[t]);
-      }
-    }
-  }
-}
-
-int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size,
-                                                     DataBatch* batch) {
-  CHECK(batch != NULL);
-  batch->clear();
-
-  int64_t startIndex;
-  int64_t cpySize;
-
-  std::lock_guard<RWLock> guard(lock_);
-  if (sampleNumInBuf_ - nextItemIndex_ < size) {
-    int64_t n = fillBuffer();
-    VLOG(1) << "fillBuffer return " << n << " samples.\n";
-  }
-
-  startIndex = nextItemIndex_;
-  cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_);
-  nextItemIndex_ += cpySize;
-
-  if (cpySize > 0) {
-    real* data = hInputDataBuf_->getData() + startIndex * sampleDim_;
-    int* label = hInputLabelBuf_->getData() + startIndex;
-    int* info = hInputInfoBuf_->getData() + startIndex;
-
-    MatrixPtr& dataBatch = *dataBatch_;     // get the thread local object
-    IVectorPtr& labelBatch = *labelBatch_;  // get the thread local object
-    IVectorPtr& infoBatch = *infoBatch_;    // get the thread local object
-    if (!dataBatch) {
-      dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_);
-      labelBatch = IVector::create(cpySize, useGpu_);
-      if (withInfo_) {
-        infoBatch = IVector::create(cpySize, 0);
-      }
-    } else {
-      dataBatch->resize(cpySize, sampleDim_);
-      labelBatch->resize(cpySize);
-      if (withInfo_) {
-        infoBatch->resize(cpySize);
-      }
-    }
-    dataBatch->copyFrom(data, cpySize * sampleDim_);
-    labelBatch->copyFrom(label, cpySize);
-    batch->appendData(dataBatch);
-    batch->appendLabel(labelBatch);
-    if (withInfo_) {
-      infoBatch->copyFrom(info, cpySize);
-      batch->appendLabel(infoBatch);
-    }
-  }
-
-  batch->setSize(cpySize);
-  return cpySize;
-}
-
-void SimpleDataProviderBase::reset() {
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-  DataProvider::reset();
-}
-
-int64_t SimpleDataProviderBase::getSize() {
-  LOG(FATAL) << "Currently, not implemented";
-  return 0;
-}
-
-int64_t SimpleDataProviderBase::fillBuffer() {
-  int64_t n = sampleNumInBuf_ - nextItemIndex_;
-
-  /* flash the remaining data to the beginning of the buffer */
-  if (n > 0) {
-    hInputDataBuf_->copyFrom(
-        hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_,
-        n * sampleDim_);
-    hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n);
-    if (withInfo_) {
-      hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n);
-    }
-  }
-
-  sampleNumInBuf_ =
-      n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
-                        hInputLabelBuf_->getData() + n,
-                        hInputInfoBuf_->getData() + n,
-                        bufferCapacity_ - n);
-
-  /* for stachastic gradient training */
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  nextItemIndex_ = 0;
-
-  return sampleNumInBuf_;
-}
-
-SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
-    : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false),
-      currentSampleIndex_(0) {
-  loadData(config_.files());
-}
-
-SimpleDataProvider::~SimpleDataProvider() {}
-
-int64_t SimpleDataProvider::fillBufferImp(real* data,
-                                          int* label,
-                                          int* info,
-                                          int64_t size) {
-  (void)info;
-  int64_t n = std::min<int64_t>(labels_.size() - currentSampleIndex_, size);
-  memcpy(data,
-         &data_[currentSampleIndex_ * sampleDim_],
-         n * sampleDim_ * sizeof(real));
-  memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
-  currentSampleIndex_ += n;
-
-  return n;
-}
-
-void SimpleDataProvider::reset() {
-  currentSampleIndex_ = 0;
-  SimpleDataProviderBase::reset();
-}
-
-void SimpleDataProvider::loadData(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  std::string line;
-  while (is) {
-    if (!getline(is, line)) break;
-    LOG(INFO) << "load data file " << line;
-    loadDataFile(line);
-  }
-  LOG(INFO) << "read done, num of instance=" << labels_.size()
-            << " data size=" << data_.size();
-}
-
-void SimpleDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  std::string line;
-  std::vector<std::string> pieces;
-  while (is) {
-    if (!getline(is, line)) break;
-    str::split(line, ' ', &pieces);
-    CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size())
-        << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName
-        << " " << sampleDim_ << " from config";
-    labels_.push_back(atoi(pieces[0].c_str()));
-    for (int i = 0; i < sampleDim_; ++i) {
-      data_.push_back(atof(pieces[i + 1].c_str()));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.h b/paddle/legacy/gserver/dataproviders/DataProvider.h
deleted file mode 100644
index c2e1c5fdd6d..00000000000
--- a/paddle/legacy/gserver/dataproviders/DataProvider.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <mutex>
-#include <vector>
-
-#include "DataConfig.pb.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-/**
- * @def REGISTER_DATA_PROVIDER
- * @brief Macro for registering a data provider. The class type should contain
- *        a consturctor with parameter (DataConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                    \
-    DataProvider::registrar_.registerClass(                              \
-        #__type_name,                                                    \
-        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
-          DataProvider* dp = new __class_name(conf, useGpu);             \
-          return dp;                                                     \
-        });                                                              \
-  })
-
-/**
- * @def REGISTER_DATA_PROVIDER_EX
- * @brief Macro for registering a data provider, which contains a constructor
- *        with parameter (DataConfig, ModelConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
-  static InitFunction __reg_type_##__type_name([] {                     \
-    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-class DataBatch;
-class BufferBatch;
-typedef std::shared_ptr<DataBatch> DataBatchPtr;
-typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
-/**
- * @brief Data for batch training a neural network
- */
-class DataBatch {
- public:
-  DataBatch() : size_(0) { data_.clear(); }
-  /**
-   * @brief Get batch size
-   * @return batch size
-   */
-  int64_t getSize() const { return size_; }
-  /**
-   * @brief Get num of sequences of sequence data
-   * @return num of sequences
-   */
-  int64_t getNumSequences() const {
-    if (data_.empty()) return size_;
-    return data_[0].sequenceStartPositions
-               ? data_[0].sequenceStartPositions->getSize() - 1
-               : size_;
-  }
-  /**
-   * @brief Set batch size
-   * @param[in] size size
-   */
-  void setSize(int64_t size) { size_ = size; }
-  /**
-   * @brief Get size of argument vector
-   * @return size of argument vector
-   * @note For usual supervised learning, input data and label is needed,
-   * then there will be two argument.
-   */
-  int64_t getNumStreams() const { return data_.size(); }
-
-  /**
-   * @brief Get a argument with index i
-   * @param[in] i index in argument vector
-   * @return a argument with index i
-   */
-  const Argument& getStream(int i) const { return data_[i]; }
-  /**
-   * @brief Get all argument
-   * @return an argument vector
-   */
-  std::vector<Argument>& getStreams() { return data_; }
-  /**
-   * @brief Get all argument const
-   * @return an argument vector
-   */
-  std::vector<Argument> getStreams() const { return data_; }
-  /**
-   * @brief Clear DataBatch
-   */
-  void clear() {
-    data_.clear();
-    size_ = 0;
-  }
-
-  /**
-   * @brief Append data to DataBatch
-   * @param[in] data  matrix data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(MatrixPtr data) {
-    Argument argu;
-    argu.value = data;
-    data_.push_back(argu);
-  }
-
-  /**
-   * @brief Append sequence data to DataBatch
-   * @param[in] data                      matrix data
-   * @param[in] sequenceStartPositions    sequence data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(const MatrixPtr& data,
-                  const ICpuGpuVectorPtr& sequenceStartPositions) {
-    Argument argu;
-    argu.value = data;
-    argu.sequenceStartPositions = sequenceStartPositions;
-    data_.push_back(argu);
-  }
-  /**
-   * @brief Append label data
-   * @param[in]  label    label data
-   * @param[in]  value    matrix data, default null
-   */
-  void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) {
-    Argument argu;
-    argu.ids = label;
-    argu.value = value;
-    data_.push_back(argu);
-  }
-
-  /*
-   * @brief Append argument
-   * @param[in]  argus   DataBatch.getStreams()
-   * @param[in]  size    DataBatch.getSize()
-   * @param[in]  dataId  sub dataprovider id (in MultiDataProvider)
-   */
-  void appendArguments(const std::vector<Argument>& argus,
-                       int size,
-                       int dataId) {
-    size_ += size;
-    for (const auto& argu : argus) {
-      data_.push_back(argu);
-      data_.back().dataId = dataId;
-    }
-  }
-
- protected:
-  /**
-   * @brief batch size
-   */
-  int64_t size_;
-  /**
-   * @brief A batch data consist of a Argument vector,
-   * An argument corresponds to a type of input data.
-   */
-  std::vector<Argument> data_;
-};
-
-class BufferBatch {
- public:
-  BufferBatch() {
-    hlStream_ = HPPL_STREAM_DEFAULT;
-    hlEvent_ = NULL;
-    batchData_ = NULL;
-  }
-  ~BufferBatch() {
-    if (hlEvent_) {
-      hl_destroy_event(hlEvent_);
-      hlEvent_ = NULL;
-    }
-    delete batchData_;
-    batchData_ = NULL;
-  }
-
-  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
-  DataBatch* getDataBatch() { return batchData_; }
-
-  void setCuStream(hl_stream_t stream) { hlStream_ = stream; }
-  hl_stream_t getCuStream() const { return hlStream_; }
-
-  void setCuEvent(hl_event_t event) { hlEvent_ = event; }
-
-  hl_event_t getCuEvent() const { return hlEvent_; }
-
-  void createCuEvent() {
-    if (!hlEvent_) {
-      hlStream_ = HPPL_STREAM_1;
-      hl_create_event(&hlEvent_);
-    }
-  }
-
-  void syncEvent() {
-    if (hlEvent_) {
-      hl_stream_wait_event(hlStream_, hlEvent_);
-    }
-  }
-
-  void swap(BufferBatch* bufBatch);
-  void clone(DataBatch* srcBatch, bool useGpu);
-
- protected:
-  DataBatch* batchData_;
-  hl_stream_t hlStream_;
-  hl_event_t hlEvent_;
-};
-
-class DataProvider;
-typedef std::shared_ptr<DataProvider> DataProviderPtr;
-
-typedef Queue<BufferBatch*> BufferBatchQueue;
-
-class DoubleBuffer {
- public:
-  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
-  virtual ~DoubleBuffer();
-  void removeOneBatch(DataBatch* dataBatch);
-
-  void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; }
-
-  int64_t getBatchSize() { return batchSize_; }
-
-  void startAsyncLoad();
-  void finishAsyncLoad() {
-    stopping_ = true;
-    taskReadySem_.post();
-    if (asyncLoader_) {
-      asyncLoader_->join();
-    }
-  }
-
-  void setPending(bool pending) { pending_ = pending; }
-
- protected:
-  virtual void asyncLoadBatch();
-  void insertOneBatch(DataBatch* batch);
-
-  DataProvider* dataPool_;
-  bool useGpu_;
-  int32_t batchSize_;
-  ThreadLocal<BufferBatchPtr> usingBatch_;
-  BufferBatchQueue* dataQueue_;
-  BufferBatchQueue* bufferQueue_;
-  std::unique_ptr<std::thread> asyncLoader_;
-  Semaphore taskReadySem_;
-  bool stopping_;
-  bool pending_;
-};
-
-/**
- * @brief Base class for DataProvider, which supplies data for training
- * @note It can supplies multiple streams of data.
- * For typical supervised training, there are two streams:
- * one is for input, one is for label.
- */
-class DataProvider {
- public:
-  static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
-  static DataProvider* create(const DataConfig& config,
-                              const ModelConfig& modelConfig,
-                              bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * @brief create only used for unittest.
-   */
-  inline static DataProvider* create(const DataConfig& config,
-                                     bool useGpu = FLAGS_use_gpu) {
-    return create(config, ModelConfig(), useGpu);
-  }
-
-  DataProvider(const DataConfig& config, bool useGpu)
-      : config_(config),
-        skipShuffle_(false),
-        usageRatio_(config.usage_ratio()),
-        useGpu_(useGpu) {
-    if (config_.async_load_data()) {
-      initAsyncLoader();
-    }
-  }
-  virtual ~DataProvider() {}
-
-  const DataConfig& getConfig() const { return config_; }
-
-  void setSkipShuffle() { skipShuffle_ = true; }
-
-  /**
-   * @brief Get next batch of training samples
-   * @param[in]    size    size of training samples to get
-   * @param[out]   batch   a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatch(int64_t size, DataBatch* batch);
-
-  /**
-   * @brief Shuffle the data set
-   */
-  virtual void shuffle() = 0;
-
-  /**
-   * @brief reset all the value of index
-   * @note reset() must be called before any calls to getNextBatch()
-   * IMPORTANT: subclass reset() should always call the base class reset()
-   * at the end of the function
-   */
-  virtual void reset() {
-    if (doubleBuffer_ != nullptr) {
-      doubleBuffer_->startAsyncLoad();
-    }
-  }
-
-  /**
-   * @brief Get the size of training samples
-   * @return the number of training samples in the data set.
-   * @note return -1 to indicate unlimited number of samples.
-   */
-  virtual int64_t getSize() = 0;
-
-  /**
-   * @brief Get next batch training samples internally
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
-
- protected:
-  DataConfig config_;
-  bool skipShuffle_;
-  float usageRatio_;
-  bool useGpu_;
-  std::unique_ptr<DoubleBuffer> doubleBuffer_;
-  ThreadLocal<std::vector<MatrixPtr>> constantSlots_;
-  /**
-   * @@brief Get next batch training samples from buffer
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch);
-
-  void initAsyncLoader();
-};
-
-/**
- * A data provider which does nothing. It only serves as providing
- * necessary configurations such as stream_names
- */
-class DummyDataProvider : public DataProvider {
- public:
-  DummyDataProvider(const DataConfig& config, bool useGpu)
-      : DataProvider(config, useGpu) {}
-  virtual void shuffle() {}
-  virtual void reset() { DataProvider::reset(); }
-  virtual int64_t getSize() { return 0; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) {
-    (void)size;
-    (void)batch;
-    return 0;
-  }
-};
-
-/**
- * Data provider for one input and one integer label.
- */
-class SimpleDataProviderBase : public DataProvider {
- protected:
-  /// sample feature dimension
-  int64_t sampleDim_;
-  /// the number of samples
-  int64_t bufferCapacity_;
-  int64_t sampleNumInBuf_;
-  /// next item to read in buffer
-  int64_t nextItemIndex_;
-  /// some user defined info for validation
-  bool withInfo_;
-
-  /// data buffer: bufferCapacity_ * nDataDim_
-  CpuMatrixPtr hInputDataBuf_;
-
-  /// label buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputLabelBuf_;
-
-  /// info buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputInfoBuf_;
-
-  ThreadLocal<MatrixPtr> dataBatch_;
-  ThreadLocal<IVectorPtr> labelBatch_;
-  ThreadLocal<IVectorPtr> infoBatch_;
-
-  RWLock lock_;
-
- public:
-  SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
-  ~SimpleDataProviderBase() {}
-
-  void shuffle();
-
-  virtual void reset();
-
-  virtual int64_t getSize();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-  /// return the number of samples in the buffer
-  int64_t fillBuffer();
-
- protected:
-  /**
-   * @brief Fill at most size samples into data and label.
-   *
-   * Each input is stored in contiguous memory locations in data.
-   *
-   * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for
-   * the input of the n-th sample.
-   *
-   * label[n] is the label for the n-th sample.
-   */
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size) = 0;
-};
-
-class SimpleDataProvider : public SimpleDataProviderBase {
- public:
-  SimpleDataProvider(const DataConfig& config, bool useGpu);
-  ~SimpleDataProvider();
-  virtual void reset();
-
- protected:
-  void loadData(const std::string& fileName);
-  void loadDataFile(const std::string& fileName);
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size);
-
- protected:
-  size_t currentSampleIndex_;
-  std::vector<int> labels_;
-  std::vector<real> data_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProviderGroup.h b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
deleted file mode 100644
index 91c94dc986c..00000000000
--- a/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-template <class T>
-class DataProviderGroup : public DataProvider {
- protected:
-  typedef T ProviderType;
-  typedef std::shared_ptr<ProviderType> ProviderPtrType;
-  ProviderPtrType provider_;
-
-  std::vector<std::string> fileList_;
-  std::mutex lock_;
-  std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
-
- public:
-  DataProviderGroup(const DataConfig& config, bool useGpu);
-  ~DataProviderGroup() {}
-
-  virtual void reset();
-  virtual void shuffle() {}
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
- private:
-  void startLoader();
-  void stopLoader();
-  void forceStopLoader();
-  ProviderPtrType loadFile(const std::vector<std::string>& fileList);
-};
-
-template <class T>
-DataProviderGroup<T>::DataProviderGroup(const DataConfig& config, bool useGpu)
-    : DataProvider(config, useGpu) {
-  // load file list
-  loadFileList(config_.files(), fileList_);
-  CHECK_GT(fileList_.size(), 0LU);
-  LOG(INFO) << "load file list, numfiles=" << fileList_.size()
-            << ", max_num_of_data_providers_in_memory="
-            << (1 + config_.file_group_conf().queue_capacity() +
-                config_.file_group_conf().load_thread_num());
-}
-
-template <class T>
-void DataProviderGroup<T>::reset() {
-  forceStopLoader();
-  CHECK(!loader_);
-  provider_ = nullptr;
-
-  // shuffle file list
-  std::shuffle(
-      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
-
-  startLoader();
-  DataProvider::reset();
-}
-
-template <class T>
-int64_t DataProviderGroup<T>::getNextBatchInternal(int64_t size,
-                                                   DataBatch* batch) {
-  std::lock_guard<std::mutex> guard(lock_);
-
-  if (!loader_) {
-    return 0;
-  }
-  if (provider_) {
-    int64_t ret = provider_->getNextBatchInternal(size, batch);
-    if (ret > 0) {
-      return ret;
-    }
-  }
-
-  // else get data from next data provider
-  if (loader_->testResult()) {
-    LOG(INFO) << "WAIT provider";
-  }
-  provider_ = loader_->waitResult();
-  if (!provider_) {
-    stopLoader();  // All the data providers have been returned
-    return 0;
-  }
-  int64_t ret = provider_->getNextBatchInternal(size, batch);
-  CHECK(ret > 0) << "new data provider does not contain any valid samples!";
-  return ret;
-}
-
-template <class T>
-void DataProviderGroup<T>::startLoader() {
-  loader_.reset(new MultiThreadWorker<ProviderType>(
-      config_.file_group_conf().load_thread_num(),
-      config_.file_group_conf().queue_capacity()));
-
-  int loadFileCount = config_.file_group_conf().load_file_count();
-  for (size_t startPos = 0; startPos < fileList_.size();
-       startPos += loadFileCount) {
-    size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
-    std::vector<std::string> fileVec(fileList_.begin() + startPos,
-                                     fileList_.begin() + endPos);
-    loader_->addJob([this, fileVec]() -> ProviderPtrType {
-      return this->loadFile(fileVec);
-    });
-  }
-  loader_->stopAddJob();
-}
-
-template <class T>
-void DataProviderGroup<T>::stopLoader() {
-  if (loader_) {
-    loader_->stop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-void DataProviderGroup<T>::forceStopLoader() {
-  if (loader_) {
-    loader_->forceStop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-std::shared_ptr<T> DataProviderGroup<T>::loadFile(
-    const std::vector<std::string>& fileList) {
-  // disable async_load_data in sub dataprovider
-  DataConfig subConfig = config_;
-  subConfig.set_async_load_data(false);
-
-  CHECK(!fileList.empty()) << "fileList is empty";
-  ProviderPtrType provider =
-      std::make_shared<ProviderType>(subConfig, useGpu_, false);
-  provider->loadData(fileList);
-  provider->reset();
-  return provider;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
deleted file mode 100644
index e5fc6d8a88f..00000000000
--- a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiDataProvider.h"
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-using namespace std;
-
-MultiDataProvider::MultiDataProvider(const DataConfig& config,
-                                     const ModelConfig& modelConfig,
-                                     bool useGpu)
-    : DataProvider(config, useGpu) {
-  bool atLeastOneMainDataFlag = false;
-  totalDataRatio_ = 0;
-  LOG(INFO) << "MultiDataProvider: sub data provider size: "
-            << config.sub_data_configs_size();
-  LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test();
-  isTestMode_ = config.for_test();
-  for (int i = 0; i < config.sub_data_configs_size(); i++) {
-    LOG(INFO) << "dataRatio of sub(" << i
-              << ") is: " << config.sub_data_configs(i).data_ratio();
-    totalDataRatio_ += config.sub_data_configs(i).data_ratio();
-    if (config.sub_data_configs(i).is_main_data()) {
-      LOG(INFO) << "main data is [" << i << "]";
-      atLeastOneMainDataFlag = true;
-    }
-  }
-  CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not"
-                                << " have is_main_data flag";
-  LOG(INFO) << "totalDataRatio_=" << totalDataRatio_;
-  DataConfig subConfig;
-  int subDataProviderCount = config.sub_data_configs_size();
-  if (isTestMode()) {
-    LOG(INFO) << "construct MultiDataProvider in test mode";
-  } else {
-    LOG(INFO) << "construct MultiDataProvider in train mode";
-  }
-  subDataProviders_.resize(subDataProviderCount);
-  for (int i = 0; i < subDataProviderCount; i++) {
-    subConfig = config.sub_data_configs(i);
-    if (subConfig.async_load_data()) {
-      LOG(INFO) << "can not use async_load_data in sub dataprovider of "
-                   "MultiDataProvider";
-      subConfig.set_async_load_data(false);
-    }
-    subDataProviders_[i] = std::unique_ptr<DataProvider>(
-        DataProvider::create(subConfig, modelConfig, useGpu_));
-  }
-}
-
-void MultiDataProvider::reset() {
-  for (auto& elem : subDataProviders_) {
-    elem->reset();
-  }
-  DataProvider::reset();
-}
-
-void MultiDataProvider::shuffle() {
-  for (auto& elem : subDataProviders_) {
-    elem->shuffle();
-  }
-}
-
-int64_t MultiDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  batch->clear();
-  for (size_t i = 0; i < subDataProviders_.size(); ++i) {
-    // calc size according to data ratio
-    int64_t subSize =
-        (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() /
-                  totalDataRatio_);
-    DataBatch subBatch;
-    int64_t realSize =
-        subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-    if (realSize == 0) {
-      // current subDataProvider has no data
-      if (!isTestMode()) {
-        // in train mode
-        if (config_.sub_data_configs(i).is_main_data()) {
-          // is main data provider. then return 0
-          batch->clear();
-          return 0;
-        } else {
-          // not main data provider, reset current subDataProvider and try again
-          subDataProviders_[i]->reset();
-          subBatch.clear();
-          realSize =
-              subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-          CHECK_GT(realSize, 0);
-        }
-      } else {
-        // in test mode, make an empty argument
-        Argument emptyArgu;
-        std::vector<Argument> argus;
-        argus.push_back(emptyArgu);
-        batch->appendArguments(argus, 0, -1);
-        continue;
-      }
-    }
-    batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i);
-  }
-  return batch->getSize();
-}
-
-REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.h b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
deleted file mode 100644
index baa1fc01900..00000000000
--- a/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-class MultiDataProvider : public DataProvider {
- protected:
-  std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
-
- public:
-  MultiDataProvider(const DataConfig& config,
-                    const ModelConfig& modelConfig,
-                    bool useGpu);
-  ~MultiDataProvider() {}
-  virtual void reset();
-  virtual void shuffle();
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-  bool isTestMode() const { return isTestMode_; }
-
- private:
-  int totalDataRatio_;
-  bool isTestMode_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/ProtoReader.h b/paddle/legacy/gserver/dataproviders/ProtoReader.h
deleted file mode 100644
index 08d045226e1..00000000000
--- a/paddle/legacy/gserver/dataproviders/ProtoReader.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/gzip_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- * ProtoReader/ProtoWriter are used to read/write a sequence of protobuf
- * messages from/to i/ostream.
- */
-class ProtoReader {
- public:
-  explicit ProtoReader(std::istream* s, bool dataCompression = false) {
-    CHECK(s) << "istream pointer is nullptr";
-    istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
-    if (dataCompression) {
-      gzipInput_.reset(
-          new google::protobuf::io::GzipInputStream(istreamInput_.get()));
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-    } else {
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-    }
-    dataCompression_ = dataCompression;
-    approximateReadedBytes_ = 0;
-    codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                    kDefaultTotalBytesLimit);
-  }
-
-  /**
-   * read one message
-   */
-  bool read(google::protobuf::MessageLite* msg) {
-    if (approximateReadedBytes_ >= kMaxLimitBytes) {
-      // Once bytes we read get close to 64MB(larger than 55MB),
-      // we re-intialize the codedInputStream object.
-      approximateReadedBytes_ = 0;
-
-      /**
-       * Explicitly destroys the object owned by unique_ptr at first and then
-       * construct an new object.
-       *
-       * 1.reset()
-       *
-       * 2.reset(new ...)   <-- such sequence is EXTREAMLY important!
-       *
-       * Reason: (!!!Read me before you modify the following 2 lines of
-       * codes!!!)
-       *
-       * Otherwise, reset() method will ask the CodedInputStream constructor
-       * to construct the new object at first forcing the IstreamInputStream
-       * object to move its underlying pointer to the next 8192 bytes.
-       *
-       * Then the old object will be destroied calling
-       * IstreamInputStream::BackUp() to move the underlying pointer back.
-       * This means that the InstreamInputStream object is referenced by
-       * 2 different CodedInputStream object at the same time which "confuses"
-       * the position of istreamInput_'s underlying pointer. Such fatal
-       * confusion will lead to undefined behaviour when 'codedInput_' is
-       * used to read new data.
-       *
-       */
-      codedInput_.reset();
-      if (dataCompression_) {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-      } else {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-      }
-      codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                      kDefaultTotalBytesLimit);
-    }
-
-    uint32_t size;
-    if (!codedInput_->ReadVarint32(&size)) {
-      return false;
-    }
-    google::protobuf::io::CodedInputStream::Limit limit =
-        codedInput_->PushLimit(size);
-    CHECK(msg->ParseFromCodedStream(codedInput_.get()));
-    codedInput_->PopLimit(limit);
-
-    /**
-     * size is varint in the data file, we don't know the length.
-     * We assume every size takes 4 bytes in the data file.
-     */
-    approximateReadedBytes_ += 4 + size;
-    return true;
-  }
-
- protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
-  std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
-  std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
-  bool dataCompression_;
-
-  /**
-   * This is the maximum number of bytes that this CodedInputStream will read
-   * before refusing to continue.
-   */
-  static const int kDefaultTotalBytesLimit = 64 << 20;  // 64MB
-
-  /**
-   * If data readed by the reader is more than 55MB( << 64MB),
-   * we reset the CodedInputStream object.
-   * This can help avoid 64MB warning which will cause the ParseFromCodedStream
-   * to fail.
-   */
-  static const int kMaxLimitBytes = 55 << 20;
-
-  /**
-   * This variable dosen't store the exact bytes readed by CodedInputStream
-   * object since which is constructed. Instead, it store the approximate bytes
-   * because we can't tell how many bytes are readed by the object with the
-   * help of API.
-   *
-   * @note this code depends on protobuf 2.4.0. There is nothing like
-   * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
-   * bytes has the object readed so far. Therefore, we calculated bytes
-   * ourselves.
-   */
-  int approximateReadedBytes_;
-};
-
-class ProtoWriter {
- public:
-  explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
-    CHECK(s) << "ostream pointer is nullptr";
-    ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
-    if (dataCompression) {
-      gzipOutput_.reset(
-          new google::protobuf::io::GzipOutputStream(ostreamOutput_.get()));
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(gzipOutput_.get()));
-    } else {
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(ostreamOutput_.get()));
-    }
-  }
-
-  /**
-   * write one message.
-   */
-  bool write(const google::protobuf::MessageLite& msg) {
-    codedOutput_->WriteVarint32(msg.ByteSize());
-    bool ret = msg.SerializeToCodedStream(codedOutput_.get());
-    return ret;
-  }
-
- protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
-  std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
-  std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
deleted file mode 100644
index 0827bd39d4c..00000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PyDataProvider.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-#ifndef PADDLE_NO_PYTHON
-REGISTER_DATA_PROVIDER(py, PyDataProvider);
-#endif
-
-PyDataProvider::PyDataProvider(const DataConfig& config,
-                               bool useGpu,
-                               bool loadDataAll)
-    : DataProvider(config, useGpu), batchSize_(0) {
-  PyGuard guard;
-  pyModuleName_ = config_.load_data_module();
-  pyClassName_ = config_.load_data_object();
-  if (config_.load_data_args() != "") {
-    pyUserArgs_["load_data_args"] = config_.load_data_args();
-  }
-
-  if (loadDataAll) {
-    std::vector<std::string> fileList;
-    if (!config_.files().empty()) {
-      loadFileList(config_.files(), fileList);
-    }
-    loadData(fileList);
-  }
-}
-
-void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
-  classInstance_ =
-      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
-  CHECK(classInstance_) << "Create class instance failed.";
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
-  CHECK_PY(obj) << "Call function getHeader failed.";
-  std::string headerInfo =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  parseHeaderData(headerInfo);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-void PyDataProvider::parseHeaderData(const std::string& headerData) {
-  char* pHeader = const_cast<char*>(headerData.c_str());
-  char* pHeaderEnd = pHeader + headerData.size();
-  slotNum_ = readT<unsigned int>(pHeader, pHeaderEnd);
-  unsigned int useSequenceFlag = readT<unsigned int>(pHeader, pHeaderEnd);
-  isIID_ = useSequenceFlag != 1;
-  slots_.clear();
-  slots_.reserve(slotNum_);
-  for (size_t i = 0; i < slotNum_; ++i) {
-    unsigned int slotType = readT<unsigned int>(pHeader, pHeaderEnd);
-    unsigned int slotDim = readT<unsigned int>(pHeader, pHeaderEnd);
-    slots_.emplace_back();
-    slots_.back().dim = slotDim;
-    slots_.back().type = static_cast<SlotDef_SlotType>(slotType);
-  }
-}
-
-void PyDataProvider::resetSlots() {
-  for (auto& slot : slots_) {
-    slot.indexData.clear();
-    slot.denseData.clear();
-    slot.sparseNonValueData.clear();
-    slot.sparseFloatValueData.clear();
-    slot.indices.clear();
-    slot.sequenceStartPositions.clear();
-    slot.sampleSequenceIdVec.clear();
-    slot.subSequenceStartPositions.clear();
-    slot.strData.clear();
-  }
-}
-
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  unsigned int dim = slot.dim;
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  slot.denseData.resize(slot.sampleNum * dim);
-#ifdef PADDLE_TYPE_DOUBLE
-  CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd)
-      << "std::copy data is out of range";
-  // PyDataProvider always provide data in float
-  float* dat = reinterpret_cast<float*>(data);
-  std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
-#else
-  memcpyWithCheck(slot.denseData.data(),
-                  data,
-                  sizeof(real) * dim * slot.sampleNum,
-                  dataEnd);
-#endif
-  // PyDataProvider always provide data in float
-  data += sizeof(float) * dim * slot.sampleNum;
-}
-
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
-                                            char*& data,
-                                            const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  slot.indices.push_back(length);
-  slot.sparseNonValueData.resize(length);
-  memcpyWithCheck(slot.sparseNonValueData.data(),
-                  data,
-                  sizeof(unsigned int) * length,
-                  dataEnd);
-  data += sizeof(unsigned int) * length;
-}
-
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
-                                         char*& data,
-                                         const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  unsigned int* colPtr = reinterpret_cast<unsigned int*>(data);
-  CHECK_LE(data + sizeof(unsigned int) * length, dataEnd)
-      << "Data is out of range";
-  data += sizeof(unsigned int) * length;
-  size_t colLen = readT<unsigned int>(data, dataEnd);
-  CHECK_EQ(colLen, length);
-  float* valuePtr = reinterpret_cast<float*>(data);
-  CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range";
-  data += sizeof(real) * length;
-  slot.indices.push_back(length);
-  slot.sparseFloatValueData.resize(length);
-  for (unsigned int ii = 0; ii < length; ++ii) {
-    slot.sparseFloatValueData[ii].col = colPtr[ii];
-    slot.sparseFloatValueData[ii].value = valuePtr[ii];
-  }
-}
-
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign is out of range";
-  slot.indexData.assign(reinterpret_cast<int*>(data),
-                        reinterpret_cast<int*>(data) + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-}
-
-void PyDataProvider::fillStringSlot(ProtoSlot& slot,
-                                    char*& data,
-                                    const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  for (unsigned int i = 0; i < slot.sampleNum; ++i) {
-    size_t len = readT<uint32_t>(data, dataEnd);
-    auto str_begin = data;
-    data += len;
-    CHECK_LE(data, dataEnd) << "Data is out of range";
-    slot.strData.emplace_back(str_begin, len);
-  }
-}
-
-void PyDataProvider::fillSlotsByStr(const std::string& samples) {
-  char* data = const_cast<char*>(samples.c_str());
-  char* dataEnd = data + samples.size();
-  batchSize_ = readT<unsigned int>(data, dataEnd);
-  if (0 == batchSize_) {
-    return;
-  }
-
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type)
-        << " Slot type:" << slot.type << " is out of range.";
-    CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type
-                                               << " is out of range.";
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE:
-        fillDenseSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        fillSparseNonValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        fillSparseValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::INDEX:
-        fillIndexSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        fillStringSlot(slot, data, dataEnd);
-        break;
-    }
-  }
-  // read sequenceStartPositions
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData()) {
-      unsigned int sequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.sequenceNum = sequenceNum;
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        slot.sequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        size_t begin = slot.sequenceStartPositions[i];
-        size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
-                                           : slot.sampleNum;
-        for (size_t ii = begin; ii < end; ++ii) {
-          slot.sampleSequenceIdVec.push_back(ii);
-        }
-      }
-    } else {
-      for (size_t i = 0; i < slot.sampleNum; ++i) {
-        slot.sampleSequenceIdVec.push_back(i);
-      }
-    }
-  }
-  // read subSequenceStartPositions, not all slots have this infomation.
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData() && data != dataEnd) {
-      unsigned int subSequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.subSequenceNum = subSequenceNum;
-      for (size_t i = 0; i < subSequenceNum; ++i) {
-        slot.subSequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-    }
-  }
-}
-
-void PyDataProvider::reset() {
-  {  // Invoke PyDataProvider Reset
-    PyGuard guard;
-    PyObjectPtr obj(PyObject_CallMethod(
-        classInstance_.get(), const_cast<char*>("reset"), NULL));
-    CHECK_PY(obj) << "Call function reset failed.";
-  }
-
-  if (!skipShuffle_) {
-    // Invoke PyDataProvider Shuffle
-    shuffle();
-  }
-  DataProvider::reset();
-}
-
-void PyDataProvider::shuffle() {
-  // py shuffle
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
-  CHECK_PY(obj) << "Call function shuffle failed.";
-}
-
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
-                         slot.sampleNum,
-                         dim,
-                         false,   // trans = false
-                         false);  // useGpu = false
-  real* buf = cpuArguments[slotIndex].value->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    memcpyWithCheck(buf + i * dim,
-                    slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim,
-                    sizeof(real) * dim,
-                    slot.denseData.data() + slot.denseData.size());
-  }
-}
-
-void PyDataProvider::handleSparseNonValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   NO_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data(),
-        HPPL_STREAM_1);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleSparseValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   FLOAT_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data(),
-        HPPL_STREAM_DEFAULT);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
-                          slot.sampleNum,
-                          /*useGpu_*/ false);
-  int* buf = cpuArguments[slotIndex].ids->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-void PyDataProvider::handleStringSlot(ProtoSlot& slot,
-                                      size_t slotIndex,
-                                      std::vector<Argument>& cpuArguments) {
-  if (cpuArguments[slotIndex].strs) {
-    cpuArguments[slotIndex].strs->resize(slot.sampleNum);
-  } else {
-    cpuArguments[slotIndex].strs =
-        std::make_shared<std::vector<std::string>>(slot.sampleNum);
-  }
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    (*cpuArguments[slotIndex].strs)[i] =
-        slot.strData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("getNextBatch"),
-                                      const_cast<char*>("i"),
-                                      size));
-  CHECK_PY(obj) << "Call function getNextBatch failed.";
-  const std::string& samples =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  resetSlots();
-  fillSlotsByStr(samples);
-  size = batchSize_;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(slotNum_);
-
-  if (!iidData()) {
-    for (size_t j = 0; j < slotNum_; ++j) {
-      auto& slot = slots_[j];
-      ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
-                                    slot.sequenceNum + 1,
-                                    /* useGpu= */ false);
-      int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
-      std::copy(slot.sequenceStartPositions.begin(),
-                slot.sequenceStartPositions.end(),
-                buf);
-      buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
-
-      if (slot.subSequenceStartPositions.size()) {
-        ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
-                                      slot.subSequenceNum + 1,
-                                      /*  useGpu= */ false);
-        int* buf =
-            cpuArguments[j].subSequenceStartPositions->getMutableData(false);
-        std::copy(slot.subSequenceStartPositions.begin(),
-                  slot.subSequenceStartPositions.end(),
-                  buf);
-        buf[slot.subSequenceNum] = slot.sampleNum;
-        // check subSequenceStartPositions and sequenceStartPositions
-        cpuArguments[j].checkSubset();
-      }
-    }
-  }
-
-  for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) {
-    auto& slot = slots_[slotIndex];
-    SlotDef::SlotType slotType = slot.type;
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE:
-        handleDenseSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        handleSparseNonValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        handleSparseValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::INDEX:
-        handleIndexSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        handleStringSlot(slot, slotIndex, cpuArguments);
-        break;
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < slotNum_; ++i) {
-      SlotDef::SlotType slotType = slots_[i].type;
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-
-        if (slots_[i].subSequenceStartPositions.size()) {
-          gpuArguments[i].subSequenceStartPositions =
-              cpuArguments[i].subSequenceStartPositions;
-        }
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.h b/paddle/legacy/gserver/dataproviders/PyDataProvider.h
deleted file mode 100644
index 4b8bea04a16..00000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include "DataFormat.pb.h"
-#include "DataProvider.h"
-
-#include <vector>
-
-namespace paddle {
-
-class PyDataProvider : public DataProvider {
- public:
-  PyDataProvider(const DataConfig& config,
-                 bool useGpu,
-                 bool loadDataAll = true);
-
-  virtual void reset();
-
-  // Note this size includes the sequences which are skipped because they
-  // are longer than the batch size
-  virtual int64_t getSize() {
-    LOG(FATAL) << "Not implement yet";
-    return -1;
-  }
-  virtual void shuffle();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
- protected:
-  struct ProtoSlot;
-  // return false if each each sample is one sequence, i.e., independent
-  // of other samples.
-  inline bool iidData() const { return isIID_; }
-
-  void parseHeaderData(const std::string& headerData);
-  void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSparseNonValueSlot(ProtoSlot& slot,
-                              char*& data,
-                              const char* dataEnd);
-  void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSlotsByStr(const std::string& samples);
-  void handleDenseSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleSparseNonValueSlot(ProtoSlot& slot,
-                                size_t slotIndex,
-                                std::vector<Argument>& cpuArguments);
-  void handleSparseValueSlot(ProtoSlot& slot,
-                             size_t slotIndex,
-                             std::vector<Argument>& cpuArguments);
-  void handleIndexSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleStringSlot(ProtoSlot& slot,
-                        size_t slotIndex,
-                        std::vector<Argument>& cpuArguments);
-  void resetSlots();
-  void loadData(const std::vector<std::string>& fileList);
-
- protected:
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    unsigned int sampleNum;
-    unsigned int sequenceNum;
-    unsigned int subSequenceNum;
-    // Store the data of index type slot
-    std::vector<int> indexData;
-    // Store the data of dense type slot
-    std::vector<real> denseData;
-    // Store the data of sparseNonValue type slot
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    // Store the data of sparseValue type slot
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    // Used to store the index of each sample in slot values
-    std::vector<int64_t> indices;
-    // The starting position of each sequence in samples
-    // The last element should be the number of samples
-    // If empty, each sample is one sequence.
-    std::vector<size_t> sequenceStartPositions;
-    // The index id of sequences in slot
-    std::vector<int64_t> sampleSequenceIdVec;
-    // The starting position of each subsequence in samples
-    // The last element should be the number of subsequence
-    // If empty, each sequence of sample has no subsequence.
-    std::vector<size_t> subSequenceStartPositions;
-    // Store the data of string type slot
-    std::vector<std::string> strData;
-  };
-  std::vector<ProtoSlot> slots_;
-
-  PyObjectPtr classInstance_;
-  unsigned int batchSize_;
-  unsigned int slotNum_;
-  // if use sequence, isIID_ equals false, otherwise it is true.
-  bool isIID_;
-  // The name of python module name
-  std::string pyModuleName_;
-  // The name of python class name
-  std::string pyClassName_;
-  // User args set in config
-  std::map<std::string, std::string> pyUserArgs_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
deleted file mode 100644
index 8e931e40611..00000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
+++ /dev/null
@@ -1,1031 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-
-#include <Python.h>
-#include <numpy/numpyconfig.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <list>
-#include <unordered_set>
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/ndarrayobject.h>
-
-#include "DataProvider.h"
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-namespace unittest {
-
-static std::unique_ptr<std::function<void(size_t /*poolActualSize */)>>
-    OnPoolFilled;
-
-namespace pydp2 {
-
-void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
-  OnPoolFilled.reset(new std::function<void(size_t)>());
-  *OnPoolFilled = callback;
-}
-
-void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
-
-}  // namespace pydp2
-}  // namespace unittest
-
-/**
- * Slot type
- */
-enum SlotType {
-  ST_DENSE = 0,
-  ST_NON_SPARSE_VALUE = 1,
-  ST_SPARSE_VALUE = 2,
-  ST_INDEX = 3
-};
-
-/**
- * Sequence type
- */
-enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
-
-/**
- * Cache Type.
- */
-enum CacheType {
-  NO_CACHE = 0,           // Each pass will load data from PyDataProvider2.
-  CACHE_PASS_IN_MEM = 1,  // First pass will load data from PyDataProvider2,
-                          // then cache all data in memory. Load data from
-                          // memory in rest passes.
-};
-
-struct SlotHeader {  // Slot Header will parse from python object's slots field.
-  size_t dim;
-  SlotType slotType;
-  SeqType seqType;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
-  os << "Dim = " << header.dim << " Type = " << header.slotType
-     << " SeqType = " << header.seqType;
-  return os;
-}
-
-/**
- * FieldScanner Interface.
- *
- * It will read python object, and fill to argument's each slot.
- * There are two steps, prepare and fill. Scanner will alloc memory during
- * prepare step, fill data into argument during fill step.
- */
-class IFieldScanner {
- public:
-  DISABLE_COPY(IFieldScanner);
-  /**
-   * Ctor.
-   * @param headerPtr slot header that scanner belong to.
-   */
-  explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {}
-  virtual ~IFieldScanner() {}
-
-  /**
-   * Start prepare step.
-   */
-  virtual void startPrepare(Argument& argument) {}
-
-  /**
-   * Prepare step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish Prepare step.
-   */
-  virtual void finishPrepare(Argument& argument) {}
-
-  /**
-   * Start fill step.
-   */
-  virtual void startFill(Argument& argument) {}
-
-  /**
-   * Fill step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish fill step.
-   */
-  virtual void finishFill(Argument& argument) {}
-
-  /**
-   * Factory method. Create a scanner by header. The final scanner may be
-   * combine many scanners.
-   *
-   * @note Fatal if header is not support.
-   */
-  static IFieldScanner* create(SlotHeader* header);
-
- protected:
-  SlotHeader* headerPtr_;
-};
-
-/**
- * Py Data Provider Cache Interface.
- */
-class IPyDataProviderCache {
- public:
-  virtual ~IPyDataProviderCache() {}
-
-  /**
-   * invoke when DataProvider::reset()
-   * @return true if read data from python.
-   */
-  virtual bool reset() = 0;
-
-  /**
-   * invoke when these data are used by DataProvider, and need to clear.
-   * @param [inout] data used data.
-   *
-   * @note The implemented class must clear these data array. Or if you want to
-   * delete the PyObjectPtr later, you should make sure the paddle process only
-   * have one active thread calling python code (use PyGuard otherwise).
-   */
-  virtual void drop(std::deque<PyObjectPtr>* data) = 0;
-
-  /**
-   * Return whole data in cache.
-   */
-  virtual std::deque<PyObjectPtr>* load() = 0;
-
-  /**
-   * Factory method. Convert CacheType to IPyDataProviderCache*
-   */
-  static IPyDataProviderCache* create(CacheType ct);
-};
-
-/**
- * PyDataProvider2.
- *
- * For usage, please refer python module 'paddle.trainer.PyDataProvider2'
- *
- * Here, we start a thread to read data. It is totally asynchronous for reading
- * data. And it support cache strategies.
- */
-class PyDataProvider2 : public DataProvider {
- public:
-  /**
-   * Ctor
-   */
-  PyDataProvider2(const DataConfig& config,
-                  const ModelConfig& modelConfig,
-                  bool useGpu)
-      : DataProvider(config, useGpu), callingContextCreated_(2) {
-    if (PyArray_API == NULL) import_array();
-    auto& args = config.load_data_args();
-    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
-    if (!args.empty()) {
-      kwargs = callPythonFuncRetPyObj(
-          "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
-    }
-
-    py::DictHelper kwargsDict(kwargs);
-    kwargsDict.setBool("is_train", !config.for_test());
-    std::vector<std::string> inputs;
-    inputs.reserve(modelConfig.input_layer_names().size());
-    std::copy(modelConfig.input_layer_names().begin(),
-              modelConfig.input_layer_names().end(),
-              std::back_inserter(inputs));
-    kwargsDict.setStringList("input_order", inputs);
-
-    // kwargs is keyword arguemts to create object.
-    this->createPyDataObj(config.load_data_module(),
-                          config.load_data_object(),
-                          config.files(),
-                          std::move(kwargs));
-    DBG << "Instance " << instance_.get() << " loaded.";
-    this->readPyFields(config.for_test());
-    DBG << "Py Field Done";
-  }
-
-  /**
-   * Dtor
-   * @note will stop loading thread when destructing
-   */
-  virtual ~PyDataProvider2() { resetImpl(false); }
-
- private:
-  void createPyDataObj(const std::string& model,
-                       const std::string& className,
-                       const std::string& fileListName,
-                       PyObjectPtr&& kwargs  // NOLINT
-                       ) {
-    LOG(INFO) << "loading dataprovider " << model << "::" << className;
-
-    PyObjectPtr module = py::import(model);
-    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
-    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
-    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
-    CHECK_PY(cls) << "load class " << className.c_str() << "error";
-
-    // If there are multiple python instance share same module, the PyObjectPtr
-    // only for instance will make python reference-count error.
-    //
-    // So here, we increase reference count manually.
-    Py_XINCREF(module.get());
-    Py_XINCREF(moduleDict.get());
-    Py_XINCREF(cls.get());
-
-    PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
-    PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
-    {
-      PyGuard guard;
-      instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get()));
-    }
-    CHECK_PY(instance_) << "Cannot Create instance";
-  }
-
-  void readPyFields(bool testing) {
-    py::ObjectHelper self(this->instance_);
-    bool ok;
-
-    this->skipShuffle_ =
-        !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
-    if (!ok) {
-      this->skipShuffle_ = testing;  // shuffle when is training, skip shuffle
-                                     // when is testing.
-    }
-    DBG << "Provider Skip Shuffle " << this->skipShuffle_;
-
-    this->poolSize_ = self.getIntAttr<size_t>("pool_size", &ok);
-    if (!ok) {
-      this->poolSize_ = -1UL;
-    }
-    this->minPoolSize_ = self.getIntAttr<size_t>("min_pool_size", &ok);
-    if (!ok) {
-      this->minPoolSize_ = -1UL;
-    }
-    this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_);
-
-    this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size");
-
-    calcBatchSize_.reset(self.getAttr("calc_batch_size"));
-    if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) {
-      this->calcBatchSize_.reset();
-    }
-
-    generator_.reset(self.getAttr("generator"));
-    CHECK(py::isCallable(generator_));
-
-    // Reading slots.
-    PyObjectPtr slotsPtr(self.getAttr("slots"));
-    py::SequenceHelper slots(slotsPtr);
-    headers_.reserve(slots.size());
-    for (size_t i = 0; i < slots.size(); ++i) {
-      headers_.emplace_back();
-      auto& header = headers_.back();
-      PyObject* hdPtr = slots[i];
-      CHECK(hdPtr != nullptr);
-      Py_XINCREF(hdPtr);
-      PyObjectPtr headerPtrWrap(hdPtr);
-      py::ObjectHelper hd(headerPtrWrap);
-      header.dim = hd.getIntAttrWithError<size_t>("dim");
-      header.seqType = (SeqType)hd.getIntAttrWithError<int>("seq_type");
-      header.slotType = (SlotType)hd.getIntAttrWithError<int>("type");
-    }
-
-    DBG << "Data header size " << headers_.size();
-    for (auto& header : headers_) {
-      DBG << header;
-    }
-    cache_.reset(IPyDataProviderCache::create(
-        (CacheType)self.getIntAttrWithError<int>("cache")));
-  }
-
-  PyObjectPtr loadPyFileLists(const std::string& fileListName) {
-    loadFileList(fileListName, fileLists_);
-    PyObject* lst = PyList_New(fileLists_.size());
-    for (size_t i = 0; i < fileLists_.size(); ++i) {
-      PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
-    }
-    return PyObjectPtr(lst);
-  }
-
-  void loadThread() {
-    DBG << "Creating context";
-    for (auto& filename : fileLists_) {
-      PyGuard g;
-      py::CallableHelper generator(this->generator_);
-      generator.setArgsSize(2);
-      generator.getArgs().set(0, instance_);
-      generator.getArgs().set(1, PyString_FromString(filename.c_str()), true);
-      callingContexts_.emplace_back(generator());
-      CHECK_PY(callingContexts_.back()) << "Generator error.";
-      CHECK(PyIter_Check(callingContexts_.back()));
-    }
-    DBG << "Create context done";
-    callingContextCreated_.wait();
-
-    PositionRandom p(skipShuffle_);
-
-    while (!exit_ && !callingContexts_.empty()) {
-      PyObject* data = nullptr;
-
-      {  // Read data.
-        size_t cid = p(callingContexts_.size());
-        bool atEnd;
-        data = py::iterNext(callingContexts_[cid], &atEnd);
-        if (atEnd || data == nullptr) {
-          if (cid != 0) {
-            std::swap(callingContexts_[cid], callingContexts_[0]);
-            cid = 0;
-          }
-
-          PyObjectPtr front;
-          {
-            std::unique_lock<std::mutex> l(mtx_);
-            front = pop_get_front(callingContexts_);
-          }
-          {
-            PyGuard g;
-            front.reset();
-          }
-          this->pullCV_.notify_all();
-          continue;
-        }
-      }
-
-      size_t additionalBatchSize = 1;
-      if (calcBatchSize_) {
-        PyGuard guard;
-        py::CallableHelper calcBatchSize(this->calcBatchSize_);
-        calcBatchSize.setArgsSize(1);
-        calcBatchSize.getArgs().set(0, data);
-        PyObjectPtr bs(calcBatchSize());
-        CHECK_PY(bs);
-        bool ok;
-        additionalBatchSize = py::castInt<size_t>(bs.get(), &ok);
-        CHECK(ok) << "CalcBatchSize must return int or long";
-      }
-
-      if (this->loadThread_) {  // wait poolActualSize < poolSize;
-        std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; });
-      }
-
-      {
-        std::lock_guard<std::mutex> guard(mtx_);
-        poolActualSize_ += additionalBatchSize;
-        dataPool_.emplace_back(data);
-      }
-      pullCV_.notify_all();
-    }
-    DBG << "load thread end";
-  }
-
-  inline void resetImpl(bool startNewThread) {
-    DBG << "Reseting " << startNewThread;
-    exit_.store(true);
-    if (loadThread_) {  // is loading.
-      loadThread_->join();
-      loadThread_.reset();
-    }
-    {
-      PyGuard g;
-      callingContexts_.clear();
-      this->pullCV_.notify_one();
-    }
-
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    {
-      PyGuard g;
-      dataPool_.clear();
-    }
-    poolActualSize_ = 0;
-
-    if (startNewThread && cache_->reset()) {
-      DBG << "Start new thread.";
-      loadThread_.reset(new std::thread([this] {
-        exit_ = false;
-        loadThread();
-      }));
-      callingContextCreated_.wait();
-    }
-    DBG << "Reset done";
-    exit_ = false;
-  }
-
- private:
-  std::unique_ptr<std::thread> loadThread_;
-  std::atomic<bool> exit_;
-  std::deque<PyObjectPtr> callingContexts_;
-  std::deque<PyObjectPtr> dataPool_;
-  size_t poolActualSize_;
-  std::condition_variable pushCV_;
-  std::condition_variable pullCV_;
-  std::mutex mtx_;
-
-  std::mutex mutexForReset_;
-
-  ThreadBarrier callingContextCreated_;
-  std::unique_ptr<IPyDataProviderCache> cache_;
-
-  PyObjectPtr instance_;
-  size_t poolSize_;
-  size_t minPoolSize_;
-  bool canOverBatchSize_;
-  PyObjectPtr calcBatchSize_;
-  PyObjectPtr generator_;
-  std::vector<std::string> fileLists_;
-  std::vector<SlotHeader> headers_;
-  static PyObjectPtr zeroTuple_;
-
-  class PositionRandom {
-   public:
-    inline explicit PositionRandom(bool skipRand)
-        : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
-
-    inline size_t operator()(size_t len) {
-      if (!skipRand_) {
-        if (!dist_ || dist_->b() != len - 1) {
-          dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
-        }
-        return (*dist_)(eng_);
-      } else {
-        return 0;
-      }
-    }
-
-   private:
-    std::default_random_engine& eng_;
-    std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
-    bool skipRand_;
-  };
-
-  // DataProvider interface
- public:
-  /**
-   * Resetting the PyDataProvider. May start reading thread here.
-   */
-  virtual void reset() {
-    resetImpl(true);
-    DataProvider::reset();
-  }
-
-  /**
-   * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
-   * select data from datapool.
-   */
-  void shuffle() {}
-
-  /**
-   * Not limited size.
-   */
-  int64_t getSize() { return -1; }
-
-  /**
-   * Loading a batch of data.
-   */
-  int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    REGISTER_TIMER("PyDP2.getNextBatchInternal")
-    CHECK_GE(size_, 0);
-    size_t size = (size_t)size_;
-    if (loadThread_) {  // loading from thread should wait for data pool ready.
-                        // but, loading from cache, cache object should ensure
-                        // data pool ready.
-      std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l, [this, &size] {
-        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
-               callingContexts_.empty();
-      });
-
-      if (unittest::OnPoolFilled) {
-        (*unittest::OnPoolFilled)(this->poolActualSize_);
-      }
-    }
-    std::deque<PyObjectPtr> data;
-    size_t bsize = 0;
-    std::deque<PyObjectPtr>* poolPtr = nullptr;
-
-    if (this->loadThread_) {  // loading from thread.
-      poolPtr = &this->dataPool_;
-    } else {  // loading from cache.
-      poolPtr = this->cache_->load();
-    }
-    if (exit_) {
-      // PyDataProvider is destructing.
-      return 0;
-    }
-    CHECK(poolPtr != nullptr);
-
-    std::deque<PyObjectPtr>& pool = *poolPtr;
-
-    while (bsize < size && !pool.empty()) {
-      {
-        // move data from pool to data
-        std::lock_guard<std::mutex> guard(mtx_);
-        if (skipShuffle_) {
-          size_t i = 0;
-          CHECK(pool[i] != nullptr);
-          data.emplace_back(std::move(pool[i]));
-          pool.pop_front();
-        } else {  // when shuffle, use swap to drop only last pool element.
-          size_t i = ThreadLocalRand::rand() % pool.size();
-          CHECK(pool[i] != nullptr);
-          if (i != 0) {
-            std::swap(pool[i], pool.front());
-          }
-          data.emplace_back(std::move(pool.front()));
-          pool.pop_front();
-        }
-
-        if (calcBatchSize_) {  // custom calc batch size.
-          PyGuard guard;
-          Py_INCREF(data.back().get());
-          py::CallableHelper calcBatchSize(calcBatchSize_);
-          calcBatchSize.setArgsSize(1);
-          calcBatchSize.getArgs().set(0, data.back());
-          PyObjectPtr customBatchSize(calcBatchSize());
-          bool ok;
-          size_t tmp = py::castInt<size_t>(customBatchSize.get(), &ok);
-          CHECK(ok) << "calc_batch_size must return int";
-
-          if (bsize + tmp > size && !canOverBatchSize_) {
-            // Put data back.
-            pool.push_front(std::move(data.back()));
-            data.pop_back();
-            break;
-          } else {
-            bsize += tmp;
-          }
-        } else {
-          bsize += 1;
-        }
-      }
-    }
-
-    if (this->loadThread_) {
-      {
-        std::lock_guard<std::mutex> g(mtx_);
-        poolActualSize_ -= bsize;
-      }
-      this->pushCV_.notify_all();
-    }
-
-    if (bsize == 0) {  // end of pass. In data pool, cannot get any data.
-      return 0;
-    }
-
-    DataBatch cpuBatch;
-    cpuBatch.setSize(bsize);
-    auto& inArgs = cpuBatch.getStreams();
-    inArgs.resize(headers_.size());
-    std::vector<std::unique_ptr<IFieldScanner>> scanners;
-    scanners.reserve(headers_.size());
-    for (auto& header : headers_) {
-      scanners.emplace_back(IFieldScanner::create(&header));
-    }
-    DBG << "Scanner created.";
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startPrepare(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->prepare(inArgs[i], s[i]);
-      }
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishPrepare(inArgs[i]);
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startFill(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->fill(inArgs[i], s[i]);
-      }
-    }
-
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishFill(inArgs[i]);
-    }
-
-    {
-      PyGuard g;
-      cache_->drop(&data);
-    }
-
-    DBG << "Reading CPU Batch Done.";
-
-    if (useGpu_) {
-      std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-      DataBatch& gpuBatch = *batch;
-      std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-      gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(bsize);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-      hl_stream_synchronize(HPPL_STREAM_1);
-    } else {
-      *batch = cpuBatch;
-    }
-    return bsize;
-  }
-};
-
-PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
-
-REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
-/**
- * Scanner for dense slot.
- */
-class DenseScanner : public IFieldScanner {
- public:
-  explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
-
-  /**
-   * Prepare.
-   * @param argument target argument
-   * @param obj each timestep of a sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreate(
-        argument.value, height_, headerPtr_->dim, false, false);
-    height_ = 0;
-  }
-
-  /**
-   * Fill argument from obj.
-   * @param argument
-   * @param obj
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
-    if (PyArray_Check(obj)) {
-      auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-      if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
-        real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
-        auto sz = PyArray_SIZE((PyArrayObject*)obj);
-        std::copy(data, data + sz, dat);
-      } else {
-        LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
-      }
-    } else {
-      py::SequenceHelper s(obj);
-      // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
-      for (size_t i = 0; i < headerPtr_->dim; ++i) {
-        dat[i] = (real)s.getDouble(i);
-      }
-    }
-    ++height_;
-  }
-
- private:
-  size_t height_;
-};
-
-/**
- * Scanner for index slot
- */
-class IndexScanner : public IFieldScanner {
- public:
-  explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
-
-  /**
-   * Prepare memory space.
-   *
-   * @note obj is a single timestep of sample
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    IVector::resizeOrCreate(argument.ids, cnt_, false);
-    cnt_ = 0;
-  }
-
-  /**
-   * Fill one index to argument.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    bool ok;
-    argument.ids->getData()[cnt_++] = py::castInt<int>(obj, &ok);
-    CHECK(ok) << "Cannot cast int " << py::repr(obj);
-  }
-
- private:
-  size_t cnt_;
-};
-
-class SparseNonValueScanner : public IFieldScanner {
- public:
-  explicit SparseNonValueScanner(SlotHeader* ptr)
-      : IFieldScanner(ptr), nnz_(0), height_(0) {}
-
-  /**
-   * Prepare memory space
-   * @note obj is a timestep of one sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    ++height_;
-    nnz_ += py::SequenceHelper(obj).size();
-  }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
-  }
-
-  virtual void startFill(Argument& argument) {
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    smat->getRows()[0] = 0;
-    nnz_ = 0;
-    height_ = 1;
-  }
-
-  /**
-   * Fill one sparse vector to argument.
-   * @note obj is a timestep of one sample.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sz = s.size();
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    int* row = smat->getRows();
-    int* col = smat->getCols();
-    real* dat = smat->getData();
-    row[height_] = row[height_ - 1] + (int)sz;
-
-    for (decltype(sz) i = 0; i < sz; ++i) {
-      setData(col + nnz_, dat + nnz_, s[i]);
-      ++nnz_;
-    }
-    ++height_;
-  }
-
- protected:
-  /**
-   * Set a single sparse index and value.
-   * @param [out] col sparse index
-   * @param [out] dat sparse value
-   * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
-   *                 For sparse_value is a Tuple (int, float).
-   */
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    bool ok;
-    *col = py::castInt<int>(obj, &ok);
-    CHECK(ok);
-  }
-
-  size_t nnz_;
-  size_t height_;
-};
-
-class SparseValueScanner : public SparseNonValueScanner {
- public:
-  explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
-  }
-
- protected:
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    SparseNonValueScanner::setData(col, dat, s[0]);
-    *dat = (real)s.getDouble(1);
-  }
-};
-
-/**
- * Sequence Scanner. Scanner for sequence or sub-sequence.
- */
-class SequenceScanner : public IFieldScanner {
- public:
-  /**
-   * Ctor
-   * @param innerScanner inner scanner for each timestep or sub-sequence.
-   * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr.
-   *                       return a sequence start position or a sub-sequence
-   *                       start position.
-   */
-  SequenceScanner(
-      std::unique_ptr<IFieldScanner>&& innerScanner,
-      const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
-      : IFieldScanner(nullptr),
-        inner_(std::move(innerScanner)),
-        cnt_(0),
-        getSeqStartPos_(getSeqStartPos) {}
-
-  /**
-   * Start prepare. Invoke inner->startPrepare too.
-   */
-  virtual void startPrepare(Argument& argument) {
-    inner_->startPrepare(argument);
-  }
-
-  /**
-   * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
-   * element of sequence obj.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->prepare(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish prepare. invoke inner_->finishPrepare too.
-   */
-  virtual void finishPrepare(Argument& argument) {
-    ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
-    inner_->finishPrepare(argument);
-  }
-
-  /**
-   * Start fill. invoke inner->startFill too.
-   */
-  virtual void startFill(Argument& argument) {
-    getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
-    cnt_ = 1;
-    inner_->startFill(argument);
-  }
-
-  /**
-   * Fill. Obj is a tuple or list. invoke inner->fill for each element of
-   * sequence obj. And set seqStartPos at same time. The seqStartPos will be
-   * calculated by getSeqStartPos callback passed in ctor.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
-        getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
-        (int)getSize(obj);
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->fill(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish fill. will invoke inner->finishFill too.
-   */
-  virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
-
- protected:
-  size_t getSize(PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
-    if (sc) {
-      size_t sum = 0;
-      for (size_t i = 0; i < s.size(); ++i) {
-        sum += sc->getSize(s[i]);
-      }
-      return sum;
-    } else {
-      return s.size();
-    }
-  }
-
- private:
-  std::unique_ptr<IFieldScanner> inner_;
-  size_t cnt_;
-  std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
-};
-
-IFieldScanner* IFieldScanner::create(SlotHeader* header) {
-  IFieldScanner* retv = nullptr;
-  switch (header->slotType) {
-    case ST_DENSE:
-      retv = new DenseScanner(header);
-      break;
-    case ST_INDEX:
-      retv = new IndexScanner(header);
-      break;
-    case ST_NON_SPARSE_VALUE:
-      retv = new SparseNonValueScanner(header);
-      break;
-    case ST_SPARSE_VALUE:
-      retv = new SparseValueScanner(header);
-      break;
-    default:
-      LOG(FATAL) << "Not implemented " << header->slotType;
-  }
-
-  switch (header->seqType) {
-    case SQT_NONE:
-      break;
-    case SQT_SUBSEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.subSequenceStartPositions;
-                                 });
-    // fall through, not break;
-    case SQT_SEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.sequenceStartPositions;
-                                 });
-      break;
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-
-  return retv;
-}
-
-/**
- * No Cache Strategy. Will destruct old data immediately and load data from
- * python every pass.
- */
-class NoCacheStrategy : public IPyDataProviderCache {
- public:
-  virtual bool reset() { return true; }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
-
-  virtual std::deque<PyObjectPtr>* load() { return nullptr; }
-};
-
-/**
- * Cache One Pass In Memory strategy.
- *
- * In first pass, will load data from python and store them in memory.
- * The rest passes, will load data from memory.
- */
-class CacheOnePassInMemory : public IPyDataProviderCache {
- public:
-  CacheOnePassInMemory()
-      : objPool_(new std::deque<PyObjectPtr>()),
-        droppedPool_(new std::deque<PyObjectPtr>()) {}
-
-  virtual bool reset() {
-    if (objPool_->empty() && droppedPool_->empty()) {
-      return true;
-    } else if (objPool_->empty()) {
-      std::swap(objPool_, droppedPool_);
-      return false;
-    } else {
-      LOG(FATAL) << "Unexpected branch";
-    }
-  }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) {
-    size_t orgSize = droppedPool_->size();
-    droppedPool_->resize(orgSize + data->size());
-    for (size_t i = 0; i < data->size(); ++i) {
-      std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
-    }
-    data->clear();
-  }
-
-  virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
-
- private:
-  std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
-  std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
-};
-
-IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
-  switch (ct) {
-    case NO_CACHE:
-      return new NoCacheStrategy();
-    case CACHE_PASS_IN_MEM:
-      return new CacheOnePassInMemory();
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-}
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
deleted file mode 100644
index c145adda5e0..00000000000
--- a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * calculate sequence-to-sequence edit distance
- */
-class CTCErrorEvaluator : public Evaluator {
- private:
-  MatrixPtr outActivations_;
-  int numTimes_, numClasses_, numSequences_, blank_;
-  real deletions_, insertions_, substitutions_;
-  int seqClassficationError_;
-  mutable std::unordered_map<std::string, real> evalResults_;
-
-  std::vector<int> path2String(const std::vector<int>& path) {
-    std::vector<int> str;
-    str.clear();
-    int prevLabel = -1;
-    for (std::vector<int>::const_iterator label = path.begin();
-         label != path.end();
-         label++) {
-      if (*label != blank_ &&
-          (str.empty() || *label != str.back() || prevLabel == blank_)) {
-        str.push_back(*label);
-      }
-      prevLabel = *label;
-    }
-    return str;
-  }
-
-  std::vector<int> bestLabelSeq() {
-    std::vector<int> path;
-    path.clear();
-    real* acts = outActivations_->getData();
-    for (int i = 0; i < numTimes_; ++i) {
-      path.push_back(std::max_element(acts + i * numClasses_,
-                                      acts + (i + 1) * numClasses_) -
-                     (acts + i * numClasses_));
-    }
-    return path2String(path);
-  }
-
-  /* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
-   * insertion"
-   * in edit-distance error */
-  real stringAlignment(std::vector<int>& gtStr,
-                       std::vector<int>& recogStr,
-                       bool backtrace = true,
-                       real sp = 1.0,
-                       real dp = 1.0,
-                       real ip = 1.0) {
-    std::vector<std::vector<int>> matrix;
-    int substitutions, deletions, insertions;
-    real distance;
-    int n = gtStr.size();
-    int m = recogStr.size();
-
-    if (n == 0) {
-      substitutions = 0;
-      deletions = 0;
-      insertions = m;
-      distance = m;
-    } else if (m == 0) {
-      substitutions = 0;
-      deletions = n;
-      insertions = 0;
-      distance = n;
-    } else {
-      substitutions = 0;
-      deletions = 0;
-      insertions = 0;
-      distance = 0;
-      // initialize the matrix
-      matrix.resize(n + 1);
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i].resize(m + 1);
-        for (int j = 0; j < m + 1; ++j) {
-          matrix[i][j] = 0;
-        }
-      }
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i][0] = i;
-      }
-      for (int j = 0; j < m + 1; ++j) {
-        matrix[0][j] = j;
-      }
-
-      // calculate the insertions, substitutions and deletions
-      for (int i = 1; i < n + 1; ++i) {
-        int s_i = gtStr[i - 1];
-        for (int j = 1; j < m + 1; ++j) {
-          int t_j = recogStr[j - 1];
-          int cost = (s_i == t_j) ? 0 : 1;
-          const int above = matrix[i - 1][j];
-          const int left = matrix[i][j - 1];
-          const int diag = matrix[i - 1][j - 1];
-          const int cell = std::min(above + 1, std::min(left + 1, diag + cost));
-          matrix[i][j] = cell;
-        }
-      }
-
-      if (backtrace) {
-        size_t i = n;
-        size_t j = m;
-        substitutions = 0;
-        deletions = 0;
-        insertions = 0;
-
-        while (i != 0 && j != 0) {
-          if (matrix[i][j] == matrix[i - 1][j - 1]) {
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) {
-            ++substitutions;
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j] + 1) {
-            ++deletions;
-            --i;
-          } else {
-            ++insertions;
-            --j;
-          }
-        }
-        while (i != 0) {
-          ++deletions;
-          --i;
-        }
-        while (j != 0) {
-          ++insertions;
-          --j;
-        }
-        int diff = substitutions + deletions + insertions;
-        if (diff != matrix[n][m]) {
-          LOG(ERROR) << "Found path with distance " << diff
-                     << " but Levenshtein distance is " << matrix[n][m];
-        }
-
-        distance = (sp * substitutions) + (dp * deletions) + (ip * insertions);
-      } else {
-        distance = (real)matrix[n][m];
-      }
-    }
-    real maxLen = std::max(m, n);
-    deletions_ += deletions / maxLen;
-    insertions_ += insertions / maxLen;
-    substitutions_ += substitutions / maxLen;
-
-    if (distance != 0) {
-      seqClassficationError_ += 1;
-    }
-
-    return distance / maxLen;
-  }
-
-  real editDistance(
-      real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
-    numTimes_ = numTimes;
-    numClasses_ = numClasses;
-    blank_ = numClasses_ - 1;
-    outActivations_ = Matrix::create(output, numTimes, numClasses);
-    std::vector<int> recogStr, gtStr;
-    recogStr = bestLabelSeq();
-    for (int i = 0; i < labelsLen; ++i) {
-      gtStr.push_back(labels[i]);
-    }
-
-    return stringAlignment(gtStr, recogStr);
-  }
-
-  void storeLocalValues() const {
-    evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
-    evalResults_["deletion_error"] =
-        numSequences_ ? deletions_ / numSequences_ : 0;
-    evalResults_["insertion_error"] =
-        numSequences_ ? insertions_ / numSequences_ : 0;
-    evalResults_["substitution_error"] =
-        numSequences_ ? substitutions_ / numSequences_ : 0;
-    evalResults_["sequence_error"] =
-        (real)seqClassficationError_ / numSequences_;
-  }
-
- public:
-  CTCErrorEvaluator()
-      : numTimes_(0),
-        numClasses_(0),
-        numSequences_(0),
-        blank_(0),
-        deletions_(0),
-        insertions_(0),
-        substitutions_(0),
-        seqClassficationError_(0) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
-    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    CHECK(label.sequenceStartPositions);
-    CHECK(label.ids);
-    size_t numSequences = label.sequenceStartPositions->getSize() - 1;
-    const int* labelStarts = label.sequenceStartPositions->getData(false);
-    const int* outputStarts = output.sequenceStartPositions->getData(false);
-    real totalErr = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      real err = 0;
-      err = editDistance(
-          output.value->getData() + output.value->getWidth() * outputStarts[i],
-          outputStarts[i + 1] - outputStarts[i],
-          output.value->getWidth(),
-          label.ids->getData() + labelStarts[i],
-          labelStarts[i + 1] - labelStarts[i]);
-
-      totalErr += err;
-    }
-
-    return totalErr;
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    Evaluator::eval(nn);
-    std::vector<Argument> arguments;
-    arguments.reserve(config_.input_layers_size());
-    for (const std::string& name : config_.input_layers()) {
-      arguments.push_back(nn.getLayer(name)->getOutput());
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSequences_ += arguments[1].getNumSequences();
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numSequences_ = 0;
-    blank_ = 0;
-    deletions_ = 0;
-    insertions_ = 0;
-    substitutions_ = 0;
-    seqClassficationError_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << " error = " << evalResults_["error"];
-    os << " deletions error = " << evalResults_["deletion_error"];
-    os << " insertions error = " << evalResults_["insertion_error"];
-    os << " substitution error = " << evalResults_["substitution_error"];
-    os << " sequence error = " << evalResults_["sequence_error"];
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    double buf[6] = {totalScore_,
-                     (double)deletions_,
-                     (double)insertions_,
-                     (double)substitutions_,
-                     (double)seqClassficationError_,
-                     (double)numSequences_};
-    client->reduce(buf, buf, 6, FLAGS_trainer_id, 0);
-    totalScore_ = buf[0];
-    deletions_ = (real)buf[1];
-    insertions_ = (real)buf[2];
-    substitutions_ = (real)buf[3];
-    seqClassficationError_ = (int)buf[4];
-    numSequences_ = (int)buf[5];
-  }
-
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + evalResults_.size());
-    for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = evalResults_.find(buffers[buffers.size() - 1]);
-
-    if (it == evalResults_.end()) {
-      *err = Error("Evaluator does not have the key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "ctc_edit_distance";
-  }
-};
-
-REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
deleted file mode 100644
index 0ff3f2fa8cf..00000000000
--- a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-#include "Evaluator.h"
-
-namespace paddle {
-
-/**
- * Chunk evaluator is used to evaluate segment labelling accuracy for a
- * sequence. It calculates the chunk detection F1 score.
- *
- * A chunk is correctly detected if its beginning, end and type are correct.
- * Other chunk type is ignored.
- * For each label in the label sequence, we have
- *
- * @code
- * tagType = label % numTagType
- * chunkType = label / numTagType
- * otherChunkType = numChunkTypes
- * @endcode
- *
- * The total number of different labels is numTagType*numChunkTypes+1
- * We support 4 labelling scheme
- * The tag type for each of the scheme is shown as follows:
- *
- * @code
- *  Scheme Begin Inside End   Single
- *   plain  0     -      -     -
- *   IOB    0     1      -     -
- *   IOE    -     0      1     -
- *   IOBES  0     1      2     3
- * @endcode
- *
- * 'plain' means the whole chunk must contain exactly the same chunk label.
- */
-class ChunkEvaluator : public Evaluator {
-  int otherChunkType_;
-  int numChunkTypes_;  // number of chunk types besides other chunk type
-  int numTagTypes_;
-  int tagBegin_;
-  int tagInside_;
-  int tagEnd_;
-  int tagSingle_;
-
-  int64_t numLabelSegments_;
-  int64_t numOutputSegments_;
-  int64_t numCorrect_;
-
-  struct Segment {
-    int begin;
-    int end;
-    int type;
-    bool operator==(const Segment& y) const {
-      return begin == y.begin && end == y.end && type == y.type;
-    }
-  };
-
-  std::vector<Segment> labelSegments_;
-  std::vector<Segment> outputSegments_;
-  std::set<int> excludedChunkTypes_;
-  mutable std::unordered_map<std::string, real> values_;
-
- public:
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (config.chunk_scheme() == "IOB") {
-      numTagTypes_ = 2;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOE") {
-      numTagTypes_ = 2;
-      tagBegin_ = -1;
-      tagInside_ = 0;
-      tagEnd_ = 1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOBES") {
-      numTagTypes_ = 4;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = 2;
-      tagSingle_ = 3;
-    } else if (config.chunk_scheme() == "plain") {
-      numTagTypes_ = 1;
-      tagBegin_ = -1;
-      tagInside_ = -1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else {
-      LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme();
-    }
-    CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
-    otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
-
-    // the chunks of types in excludedChunkTypes_ will not be counted
-    auto& tmp = config.excluded_chunk_types();
-    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numLabelSegments_ = 0;
-    numOutputSegments_ = 0;
-    numCorrect_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << "=" << values_["F1-score"]
-       << " true_chunks=" << numLabelSegments_
-       << " result_chunks=" << numOutputSegments_
-       << " correct_chunks=" << numCorrect_;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_};
-    client->reduce(buf, buf, 3, FLAGS_trainer_id, 0);
-    numLabelSegments_ = buf[0];
-    numOutputSegments_ = buf[1];
-    numCorrect_ = buf[2];
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    IVectorPtr& output = arguments[0].ids;
-    IVectorPtr& label = arguments[1].ids;
-    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
-    auto sequenceStartPositions =
-        arguments[1].sequenceStartPositions->getVector(false);
-    CHECK_EQ(output->getSize(), label->getSize());
-    CHECK(sequenceStartPositions);
-    size_t numSequences = sequenceStartPositions->getSize() - 1;
-    const int* starts = sequenceStartPositions->getData();
-    for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i],
-            label->getData() + starts[i],
-            starts[i + 1] - starts[i]);
-    }
-    return 0;
-  }
-
-  void eval1(int* output, int* label, int length) {
-    getSegments(output, length, outputSegments_);
-    getSegments(label, length, labelSegments_);
-    size_t i = 0, j = 0;
-    while (i < outputSegments_.size() && j < labelSegments_.size()) {
-      if (outputSegments_[i] == labelSegments_[j] &&
-          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
-        ++numCorrect_;
-      }
-      if (outputSegments_[i].end < labelSegments_[j].end) {
-        ++i;
-      } else if (outputSegments_[i].end > labelSegments_[j].end) {
-        ++j;
-      } else {
-        ++i;
-        ++j;
-      }
-    }
-    for (auto& segment : labelSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
-    }
-    for (auto& segment : outputSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
-    }
-  }
-
-  void getSegments(int* label, int length, std::vector<Segment>& segments) {
-    segments.clear();
-    segments.reserve(length);
-    int chunkStart = 0;
-    bool inChunk = false;
-    int tag = -1;
-    int type = otherChunkType_;
-    for (int i = 0; i < length; ++i) {
-      int prevTag = tag;
-      int prevType = type;
-      CHECK_LE(label[i], numChunkTypes_ * numTagTypes_);
-      tag = label[i] % numTagTypes_;
-      type = label[i] / numTagTypes_;
-      if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) {
-        Segment segment{
-            chunkStart,  // begin
-            i - 1,       // end
-            prevType,
-        };
-        segments.push_back(segment);
-        inChunk = false;
-      }
-      if (isChunkBegin(prevTag, prevType, tag, type)) {
-        chunkStart = i;
-        inChunk = true;
-      }
-    }
-    if (inChunk) {
-      Segment segment{
-          chunkStart,  // begin
-          length - 1,  // end
-          type,
-      };
-      segments.push_back(segment);
-    }
-  }
-
-  // whether (prevTag, prevType) is the end of a chunk
-  bool isChunkEnd(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return false;
-    if (type == otherChunkType_) return true;
-    if (type != prevType) return true;
-    if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagEnd_) return true;
-    if (prevTag == tagSingle_) return true;
-    return false;
-  }
-
-  // whether (tag, type) is the beginning of a chunk
-  bool isChunkBegin(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return type != otherChunkType_;
-    if (type == otherChunkType_) return false;
-    if (type != prevType) return true;
-    if (tag == tagBegin_) return true;
-    if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagSingle_) return true;
-    return false;
-  }
-
-  // three metrics: precision, recall and F1-score
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + values_.size());
-    for (auto it = values_.begin(); it != values_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  // get value by field name
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = values_.find(buffers.back());
-    if (it == values_.end()) {  // not found
-      *err = Error("No such key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  // get type of evaluator
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "chunk";
-  }
-
- private:
-  void storeLocalValues() const {
-    CHECK_GE(numOutputSegments_, 0);
-    CHECK_GE(numLabelSegments_, 0);
-    double precision =
-        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
-    double recall =
-        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
-    values_["precision"] = precision;
-    values_["recall"] = recall;
-    values_["F1-score"] =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-  }
-};
-
-REGISTER_EVALUATOR(chunk, ChunkEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
deleted file mode 100644
index 57657241f8c..00000000000
--- a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/legacy/gserver/layers/DetectionUtil.h"
-
-using std::map;
-using std::vector;
-using std::pair;
-using std::make_pair;
-
-namespace paddle {
-
-/**
- * @brief detection map Evaluator
- *
- * The config file api is detection_map_evaluator.
- */
-class DetectionMAPEvaluator : public Evaluator {
- public:
-  DetectionMAPEvaluator()
-      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    allTruePos_.clear();
-    allFalsePos_.clear();
-    numPos_.clear();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    overlapThreshold_ = config_.overlap_threshold();
-    backgroundId_ = config_.background_id();
-    evaluateDifficult_ = config_.evaluate_difficult();
-    apType_ = config_.ap_type();
-
-    MatrixPtr detectTmpValue = arguments[0].value;
-    Matrix::resizeOrCreate(cpuOutput_,
-                           detectTmpValue->getHeight(),
-                           detectTmpValue->getWidth(),
-                           false,
-                           false);
-
-    MatrixPtr labelTmpValue = arguments[1].value;
-    Matrix::resizeOrCreate(cpuLabel_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    cpuOutput_->copyFrom(*detectTmpValue);
-    cpuLabel_->copyFrom(*labelTmpValue);
-
-    Argument label = arguments[1];
-    const int* labelIndex = label.sequenceStartPositions->getData(false);
-    size_t batchSize = label.getNumSequences();
-
-    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
-    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      map<size_t, vector<NormalizedBBox>> bboxes;
-      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
-        vector<NormalizedBBox> bbox;
-        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
-        int c = cpuLabel_->getData()[i * 6];
-        bboxes[c].push_back(bbox[0]);
-      }
-      allGTBBoxes.push_back(bboxes);
-    }
-
-    size_t n = 0;
-    const real* cpuOutputData = cpuOutput_->getData();
-    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
-      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
-      size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      while (curImgId == imgId && n < cpuOutput_->getHeight()) {
-        vector<real> label;
-        vector<real> score;
-        vector<NormalizedBBox> bbox;
-        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
-        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
-        ++n;
-        curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      }
-      allDetectBBoxes.push_back(bboxes);
-    }
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (map<size_t, vector<NormalizedBBox>>::iterator it =
-               allGTBBoxes[n].begin();
-           it != allGTBBoxes[n].end();
-           ++it) {
-        size_t count = 0;
-        if (evaluateDifficult_) {
-          count = it->second.size();
-        } else {
-          for (size_t i = 0; i < it->second.size(); ++i)
-            if (!(it->second[i].isDifficult)) ++count;
-        }
-        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
-          numPos_[it->first] = count;
-        } else {
-          numPos_[it->first] += count;
-        }
-      }
-    }
-
-    // calcTFPos
-    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
-
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    real mAP = calcMAP();
-    os << "Detection mAP=" << mAP;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Distribute detection evaluation not implemented.";
-  }
-
- protected:
-  void calcTFPos(const size_t batchSize,
-                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
-                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
-                     allDetectBBoxes) {
-    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
-      if (allGTBBoxes[n].size() == 0) {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          for (size_t i = 0; i < it->second.size(); ++i) {
-            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
-            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
-          }
-        }
-      } else {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
-          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
-            }
-          } else {
-            vector<NormalizedBBox> gtBBoxes =
-                allGTBBoxes[n].find(label)->second;
-            vector<bool> visited(gtBBoxes.size(), false);
-            // Sort detections in descend order based on scores
-            std::sort(predBBoxes.begin(),
-                      predBBoxes.end(),
-                      sortScorePairDescend<NormalizedBBox>);
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              real maxOverlap = -1.0;
-              size_t maxIdx = 0;
-              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
-                real overlap =
-                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
-                if (overlap > maxOverlap) {
-                  maxOverlap = overlap;
-                  maxIdx = j;
-                }
-              }
-              if (maxOverlap > overlapThreshold_) {
-                if (evaluateDifficult_ ||
-                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
-                  if (!visited[maxIdx]) {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    visited[maxIdx] = true;
-                  } else {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                  }
-                }
-              } else {
-                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-                allFalsePos_[label].push_back(
-                    make_pair(predBBoxes[i].first, 1));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  real calcMAP() const {
-    real mAP = 0.0;
-    size_t count = 0;
-    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
-         it != numPos_.end();
-         ++it) {
-      size_t label = it->first;
-      size_t labelNumPos = it->second;
-      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
-        continue;
-      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
-      vector<pair<real, size_t>> labelFalsePos =
-          allFalsePos_.find(label)->second;
-      // Compute average precision.
-      vector<size_t> tpCumSum;
-      getAccumulation(labelTruePos, &tpCumSum);
-      vector<size_t> fpCumSum;
-      getAccumulation(labelFalsePos, &fpCumSum);
-      std::vector<real> precision, recall;
-      size_t num = tpCumSum.size();
-      // Compute Precision.
-      for (size_t i = 0; i < num; ++i) {
-        CHECK_LE(tpCumSum[i], labelNumPos);
-        precision.push_back(static_cast<real>(tpCumSum[i]) /
-                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
-        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
-      }
-      // VOC2007 style
-      if (apType_ == "11point") {
-        vector<real> maxPrecisions(11, 0.0);
-        int startIdx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = startIdx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              startIdx = i;
-              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
-              break;
-            } else {
-              if (maxPrecisions[j] < precision[i])
-                maxPrecisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
-        ++count;
-      } else if (apType_ == "Integral") {
-        // Nature integral
-        real averagePrecisions = 0.;
-        real prevRecall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prevRecall) > 1e-6)
-            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
-          prevRecall = recall[i];
-        }
-        mAP += averagePrecisions;
-        ++count;
-      } else {
-        LOG(FATAL) << "Unkown ap version: " << apType_;
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP * 100;
-  }
-
-  void getAccumulation(vector<pair<real, size_t>> inPairs,
-                       vector<size_t>* accuVec) const {
-    std::stable_sort(
-        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
-    accuVec->clear();
-    size_t sum = 0;
-    for (size_t i = 0; i < inPairs.size(); ++i) {
-      sum += inPairs[i].second;
-      accuVec->push_back(sum);
-    }
-  }
-
-  std::string getTypeImpl() const { return "detection_map"; }
-
-  real getValueImpl() const { return calcMAP(); }
-
- private:
-  real overlapThreshold_;  // overlap threshold when determining whether matched
-  bool evaluateDifficult_;  // whether evaluate difficult ground truth
-  size_t backgroundId_;     // class index of background
-  std::string apType_;      // how to calculate mAP (Integral or 11point)
-
-  MatrixPtr cpuOutput_;
-  MatrixPtr cpuLabel_;
-
-  map<size_t, size_t> numPos_;  // counts of true objects each classification
-  map<size_t, vector<pair<real, size_t>>>
-      allTruePos_;  // true positive prediction
-  map<size_t, vector<pair<real, size_t>>>
-      allFalsePos_;  // false positive prediction
-};
-
-REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.cpp b/paddle/legacy/gserver/evaluators/Evaluator.cpp
deleted file mode 100644
index a956f40d02e..00000000000
--- a/paddle/legacy/gserver/evaluators/Evaluator.cpp
+++ /dev/null
@@ -1,1361 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-void Evaluator::eval(const NeuralNetwork& nn) {
-  std::vector<Argument> arguments;
-  arguments.reserve(config_.input_layers_size());
-  for (const std::string& name : config_.input_layers()) {
-    arguments.push_back(nn.getLayer(name)->getOutput());
-  }
-  SetDevice device(arguments[0].deviceId);
-  real score = evalImp(arguments);
-  totalScore_ += score;
-  updateSamplesNum(arguments);
-}
-/**
- * @brief classification error Evaluator
- *
- * The config file api is classification_error_evaluator.
- */
-class ClassificationErrorEvaluator : public Evaluator {
- public:
-  /*
-  ClassificationErrorEvaluator() : totalScore2_(0) {}
-
-  virtual void start() {
-    Evaluator::start();
-    totalScore2_ = 0;
-    } */
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (3 == arguments.size()) {
-      numSamples_ += arguments[2].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  MatrixPtr calcError(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), (size_t)2);
-    CHECK_LE(arguments.size(), (size_t)3);
-    MatrixPtr& output = arguments[0].value;
-    IVectorPtr& label = arguments[1].ids;
-    MatrixPtr& multiBinaryLabel = arguments[1].value;  // For multi binary label
-    bool supportWeight = (3 == arguments.size()) ? true : false;
-    MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-    if (nullptr == output ||
-        (nullptr == label && nullptr == multiBinaryLabel) ||
-        (supportWeight && nullptr == weight)) {
-      return 0;
-    }
-
-    if (label != nullptr) {
-      CHECK_EQ(label->getSize(), output->getHeight());
-    } else {
-      CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight());
-      CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth());
-    }
-    if (supportWeight) {
-      CHECK_EQ(output->getHeight(), weight->getHeight());
-      CHECK_EQ((size_t)1, weight->getWidth());
-    }
-
-    const MatrixPtr errorMat = Matrix::create(output->getHeight(),
-                                              1,
-                                              /* trans= */ false,
-                                              useGpu(arguments[0].deviceId));
-
-    errorMat->zeroMem();
-
-    if (label != nullptr) {
-      errorMat->classificationError(*output, *label, config_.top_k());
-    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
-               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
-      errorMat->classificationErrorMulti(
-          *output, *multiBinaryLabel, config_.classification_threshold());
-    } else {
-      errorMat->binaryClassificationError(
-          0, *output, *multiBinaryLabel, config_.classification_threshold());
-    }
-
-    if (supportWeight) {
-      errorMat->dotMul(*errorMat, *weight);
-    }
-    return errorMat;
-  }
-
-  void printStats(std::ostream& os) const {
-    if (config_.top_k() == 1) {
-      os << config_.name() << "="
-         << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    } else {
-      os << " top_" << config_.top_k()
-         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-    return errorMat->getSum();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "classification_error"; }
-};
-
-/**
- * @brief sequence classification error Evaluator
- * @note sequence level classification error stats,
- * if any frame in one sequence has error, the sequence is error
- */
-class SequenceClassificationErrorEvaluator
-    : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getNumSequences();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    auto sequenceStartPositions =
-        arguments[0].sequenceStartPositions->getVector(false);
-    CHECK(sequenceStartPositions != nullptr);
-    const int* starts = sequenceStartPositions->getData();
-
-    MatrixPtr errorMat = calcError(arguments);
-
-    int errCounter = 0;
-    CpuVector errorVec(0, nullptr);
-    for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
-      errorVec.subVecFrom(
-          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
-      if (errorVec.getSum() > 0) {
-        errCounter += 1;
-      }
-    }
-
-    return static_cast<real>(errCounter);
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "seq_classification_error"; }
-};
-REGISTER_EVALUATOR(seq_classification_error,
-                   SequenceClassificationErrorEvaluator);
-/**
- * @brief sum Evaluator
- * Calculate the sum of output or label
- *
- * The config file api is sum_evaluator.
- */
-class SumEvaluator : public Evaluator {
- public:
-  SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("SumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (supportWeight) {
-      if (nullptr == arguments[1].value) {
-        return 0;
-      }
-      CHECK_EQ(arguments[1].value->getWidth(), (size_t)1);
-    }
-
-    // The sum of output
-    if (arguments[0].value) {
-      if (supportWeight) {
-        CHECK_EQ(arguments[0].value->getHeight(),
-                 arguments[1].value->getHeight());
-        MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(),
-                                          arguments[0].value->getWidth(),
-                                          /* trans= */ false,
-                                          arguments[0].value->useGpu());
-        tmpMat->copyFrom(*arguments[0].value);
-        tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        return tmpMat->getSum();
-      } else {
-        return arguments[0].value->getSum();
-      }
-      // The sum of label
-    } else if (arguments[0].ids) {
-      size_t insNum = arguments[0].ids->getSize();
-      IVectorPtr label = arguments[0].ids;
-      MatrixPtr weight = supportWeight ? arguments[1].value : nullptr;
-      if (dynamic_cast<GpuIVector*>(label.get())) {
-        IVector::resizeOrCreate(cpuLabel_, insNum, false);
-        cpuLabel_->copyFrom(*arguments[0].ids);
-
-        if (supportWeight) {
-          CHECK_EQ(insNum, arguments[1].value->getHeight());
-          Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-          cpuWeight_->copyFrom(*arguments[1].value);
-        }
-
-        label = cpuLabel_;
-        weight = cpuWeight_;
-      }
-
-      if (supportWeight) {
-        real score = 0.0;
-        int* labelD = label->getData();
-        real* weightD = weight->getData();
-        for (size_t i = 0; i < insNum; ++i) {
-          score += (labelD[i] * weightD[i]);
-        }
-        return score;
-      } else {
-        return label->getSum();
-      }
-    } else {
-      return 0;
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
- private:
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "sum"; }
-};
-/**
- * @brief column sum Evaluator
- * @note column sum for the colIdx-th column *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is column_sum_evaluator.
- *
- */
-class ColumnSumEvaluator : public Evaluator {
- public:
-  explicit ColumnSumEvaluator(int32_t colIdx)
-      : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    if (nullptr != sum_) {
-      sum_->zeroMem();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("ColumnSumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (nullptr == arguments[0].value ||
-        (supportWeight && nullptr == arguments[1].value)) {
-      return 0;
-    }
-
-    size_t insNum = arguments[0].value->getHeight();
-    size_t colNum = arguments[0].value->getWidth();
-    if (nullptr == sum_) {
-      sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false);
-      colNum_ = colNum;
-      sum_->zeroMem();
-    } else {
-      CHECK_EQ(colNum, sum_->getWidth());
-    }
-
-    if (supportWeight) {
-      CHECK_EQ(insNum, arguments[1].value->getHeight());
-      CHECK_EQ((size_t)1, arguments[1].value->getWidth());
-      MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-      if (arguments[0].value->useGpu()) {
-        tmpMat->copyFrom(*arguments[0].value);
-      }
-      if (!arguments[1].value->useGpu()) {
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        }
-      } else {
-        MatrixPtr tmp2 = Matrix::create(insNum, 1);
-        tmp2->copyFrom(*arguments[1].value);
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *tmp2);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *tmp2);
-        }
-      }
-      sum_->accumulateColSum(*tmpMat);
-    } else {
-      if (!arguments[0].value->useGpu()) {
-        sum_->accumulateColSum(*arguments[0].value);
-      } else {
-        MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-        tmpMat->copyFrom(*arguments[0].value);
-        sum_->accumulateColSum(*tmpMat);
-      }
-    }
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
-        << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", "
-        << colNum_ << ")";
-    size_t colIdx = 0;
-    if (colIdx_ >= 0) {
-      colIdx = colIdx_;
-    } else {
-      colIdx = colNum_ + colIdx_;
-    }
-    os << config_.name() << "="
-       << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0);
-  }
-
-  void distributeEval(ParameterClient2* client) {
-    client->reduce(
-        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
-    client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
-  }
-
- private:
-  int32_t colIdx_;
-  size_t colNum_;
-  MatrixPtr sum_; /* cpu matrix */
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const {
-    if (colIdx_ == -1)
-      return "last-column-sum";
-    else
-      return "column-sum";
-  }
-};
-
-void AucEvaluator::start() {
-  Evaluator::start();
-  memset(statPos_, 0, sizeof(statPos_));
-  memset(statNeg_, 0, sizeof(statNeg_));
-}
-
-real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("AucEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr labelval = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-
-  if (nullptr == output || (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  // Copy label from value to a vector.
-  if (nullptr == label && nullptr != labelval) {
-    // label width is 1
-    CHECK_EQ(1U, labelval->getWidth());
-    VectorPtr vec =
-        Vector::create(labelval->getData(), insNum, output->useGpu());
-    label = vec->castToInt();
-  }
-
-  CHECK_EQ(insNum, label->getSize());
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
-      << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", "
-      << outputDim << ")";
-  realColumnIdx_ = 0;
-  if (colIdx_ >= 0) {
-    realColumnIdx_ = colIdx_;
-  } else {
-    realColumnIdx_ = outputDim + colIdx_;
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           insNum,
-                           outputDim,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, insNum, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = supportWeight ? weight->getData() : nullptr;
-  size_t pos = realColumnIdx_;
-
-  for (size_t i = 0; i < insNum; ++i) {
-    real value = outputD[pos];
-    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
-    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
-                              << "] out of range, predict value[" << value
-                              << "]";
-    real w = supportWeight ? weightD[i] : 1.0;
-    if (labelD[i] == kNegativeLabel_) {
-      statNeg_[binIdx] += w;
-    } else {
-      statPos_[binIdx] += w;
-    }
-    pos += outputDim;
-  }
-  return 0;
-}
-
-void AucEvaluator::distributeEval(ParameterClient2* client) {
-  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-}
-
-double AucEvaluator::calcAuc() const {
-  double totPos = 0.0;
-  double totNeg = 0.0;
-  double totPosPrev = 0.0;
-  double totNegPrev = 0.0;
-  double auc = 0.0;
-
-  int64_t idx = kBinNum_;
-  while (idx >= 0) {
-    totPosPrev = totPos;
-    totNegPrev = totNeg;
-    totPos += statPos_[idx];
-    totNeg += statNeg_[idx];
-    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-    --idx;
-  }
-
-  if (totPos > 0.0 && totNeg > 0.0) {
-    return auc / totPos / totNeg;
-  } else {
-    return 0.0;
-  }
-}
-
-real AucEvaluator::getValueImpl() const { return calcAuc(); }
-
-std::string AucEvaluator::getTypeImpl() const {
-  if (colIdx_ == -1) {
-    return "last-column-auc";
-  } else {
-    return "auc";
-  }
-}
-
-// class RankAucEvaluator
-REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
-
-void RankAucEvaluator::start() { Evaluator::start(); }
-void RankAucEvaluator::updateSamplesNum(
-    const std::vector<Argument>& arguments) {
-  numSamples_ += arguments[0].getNumSequences();
-}
-real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 2U);
-  CHECK_LE(arguments.size(), 3U);
-  double batchAuc = 0.0;
-  output_ = arguments[0].value;
-  click_ = arguments[1].value;
-  size_t batchSize = output_->getHeight();
-  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
-
-  if (arguments.size() == 3U) {
-    pv_ = arguments[2].value;
-  } else {
-    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
-    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
-  }
-
-  real* outputData = output_->getData();
-  real* clickData = click_->getData();
-  real* pvData = pv_->getData();
-
-  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
-  const int* startPosData = startPos->getData();
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    batchAuc += calcRankAuc(outputData + beginPos,
-                            clickData + beginPos,
-                            pvData + beginPos,
-                            endPos - beginPos);
-  }
-  return batchAuc;
-}
-
-double RankAucEvaluator::calcRankAuc(real* outputData,
-                                     real* clickData,
-                                     real* pvData,
-                                     size_t size) {
-  outputPair_.clear();
-  for (size_t i = 0; i < size; ++i) {
-    outputPair_.push_back(std::make_pair(outputData[i], i));
-  }
-  std::sort(outputPair_.begin(),
-            outputPair_.end(),
-            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-              return a.first > b.first;
-            });
-  double aucTmp = 0.0;
-  double clickSum = 0.0;
-  double oldClickSum = 0.0;
-  double noClick = 0.0;
-  double noClickSum = 0.0;
-
-  double lastScore = outputPair_[0].first + 1.0;
-  for (size_t i = 0; i < size; ++i) {
-    if (lastScore != outputPair_[i].first) {
-      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-      oldClickSum = clickSum;
-      noClick = 0.0;
-      lastScore = outputPair_[i].first;
-    }
-    size_t id = outputPair_[i].second;
-    noClick += pvData[id] - clickData[id];
-    noClickSum += noClick;
-    clickSum += clickData[id];
-  }
-  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-  return (clickSum * noClickSum) == 0.0 ? 0.0
-                                        : aucTmp / (clickSum * noClickSum);
-}
-
-std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
-
-// class PrecisionRecallEvaluator
-REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
-
-void PrecisionRecallEvaluator::start() {
-  Evaluator::start();
-  statsInfo_.clear();
-  values_.clear();
-}
-
-real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("PrecisionRecallEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr multiBinaryLabel = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  if (label != nullptr) {
-    CHECK_EQ(insNum, label->getSize());
-  } else {
-    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
-    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
-  }
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (statsInfo_.size() != outputDim) {
-    statsInfo_.clear();
-    statsInfo_.resize(outputDim);
-  }
-
-  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
-  if (label != nullptr) {
-    if (dynamic_cast<GpuMatrix*>(output.get())) {
-      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
-      cpuOutput_->copyFrom(*output);
-      IVector::resizeOrCreate(cpuLabel_, insNum, false);
-      cpuLabel_->copyFrom(*label);
-      if (supportWeight) {
-        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-        cpuWeight_->copyFrom(*weight);
-      }
-
-      output = cpuOutput_;
-      label = cpuLabel_;
-      weight = cpuWeight_;
-    }
-    calcStatsInfo(output, label, weight);
-  } else {
-    // Not support GPU for multi binary labels
-    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
-    calcStatsInfoMulti(output, multiBinaryLabel, weight);
-  }
-  return 0;
-}
-
-void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  PrintStatsInfo info;
-  bool containMacroMicroInfo = getStatsInfo(&info);
-  os << "positive_label=" << config_.positive_label()
-     << " precision=" << info.precision << " recall=" << info.recall
-     << " F1-score=" << info.f1;
-  if (containMacroMicroInfo) {
-    os << "macro-average-precision=" << info.macroAvgPrecision
-       << " macro-average-recall=" << info.macroAvgRecall
-       << " macro-average-F1-score=" << info.macroAvgF1Score;
-    if (!isMultiBinaryLabel_) {
-      // precision and recall are equal in this case
-      os << " micro-average-precision=" << info.microAvgPrecision;
-    } else {
-      os << " micro-average-precision=" << info.microAvgPrecision
-         << " micro-average-recall=" << info.microAvgRecall
-         << " micro-average-F1-score=" << info.microAvgF1Score;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
-                                             const IVectorPtr& label,
-                                             const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  for (size_t i = 0; i < insNum; ++i) {
-    CHECK_GE(labelD[i], 0);
-    CHECK_LT((size_t)labelD[i], dim);
-    size_t maxIdx = 0;
-    real maxValue = outputD[i * dim];
-    for (size_t j = 1; j < dim; ++j) {
-      size_t idx = i * dim + j;
-      if (maxValue < outputD[idx]) {
-        maxIdx = j;
-        maxValue = outputD[idx];
-      }
-    }
-
-    real w = (weightD != nullptr) ? weightD[i] : 1.0;
-    if (maxIdx == (size_t)labelD[i]) {
-      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
-      // true negative for all labels except for labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-    } else {
-      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
-      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
-      // true negatives for all labels except for maxIdx and labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-      statsInfo_[labelD[i]].TN -= w;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
-                                                  const MatrixPtr& label,
-                                                  const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  real threshold = config_.classification_threshold();
-  for (size_t i = 0; i < insNum; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + j;
-      if (outputD[idx] < threshold) {
-        statsInfo_[j].TN += w;  // true negative
-      } else {
-        statsInfo_[j].FP += w;  // false positive
-      }
-    }
-
-    const int* cols = labelD->getRowCols(i);
-    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + cols[j];
-      if (outputD[idx] < threshold) {
-        statsInfo_[cols[j]].FN += w;  // false negative
-        statsInfo_[cols[j]].TN -= w;  // true negative
-      } else {
-        statsInfo_[cols[j]].TP += w;  // true positive
-        statsInfo_[cols[j]].FP -= w;  // false positive
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::storeLocalValues() const {
-  if (this->values_.size() == 0) {
-    PrintStatsInfo info;
-    bool containMacroMicroInfo = getStatsInfo(&info);
-    values_["precision"] = info.precision;
-    values_["recal"] = info.recall;
-    values_["F1-score"] = info.f1;
-    if (containMacroMicroInfo) {
-      values_["macro-average-precision"] = info.macroAvgPrecision;
-      values_["macro-average-recall"] = info.macroAvgRecall;
-      values_["macro-average-F1-score"] = info.macroAvgF1Score;
-      if (!isMultiBinaryLabel_) {
-        // precision and recall are equal in this case
-        values_["micro-average-precision"] = info.microAvgPrecision;
-      } else {
-        values_["micro-average-precision"] = info.microAvgPrecision;
-        values_["micro-average-recall"] = info.microAvgRecall;
-        values_["micro-average-F1-score"] = info.microAvgF1Score;
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
-  this->storeLocalValues();
-  names->reserve(this->values_.size());
-  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
-    names->push_back(this->config_.name() + "." + it->first);
-  }
-}
-
-real PrecisionRecallEvaluator::getValue(const std::string& name,
-                                        Error* err) const {
-  this->storeLocalValues();
-  std::vector<std::string> buffers;
-  paddle::str::split(name, '.', &buffers);
-  auto it = this->values_.find(buffers[buffers.size() - 1]);
-  if (it == this->values_.end()) {  // not found
-    *err = Error("No such key %s", name.c_str());
-    return .0f;
-  }
-
-  return it->second;
-}
-
-std::string PrecisionRecallEvaluator::getType(const std::string& name,
-                                              Error* err) const {
-  this->getValue(name, err);
-  if (!err->isOK()) {
-    return "";
-  }
-  return "precision_recall";
-}
-
-void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
-  size_t size = 4 * statsInfo_.size();
-  double* buf = new double[size];
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    buf[4 * i + 0] = statsInfo_[i].TP;
-    buf[4 * i + 1] = statsInfo_[i].TN;
-    buf[4 * i + 2] = statsInfo_[i].FP;
-    buf[4 * i + 3] = statsInfo_[i].FN;
-  }
-  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    statsInfo_[i].TP = buf[4 * i + 0];
-    statsInfo_[i].TN = buf[4 * i + 1];
-    statsInfo_[i].FP = buf[4 * i + 2];
-    statsInfo_[i].FN = buf[4 * i + 3];
-  }
-  delete[] buf;
-}
-
-bool PrecisionRecallEvaluator::getStatsInfo(
-    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    info->f1 = calcF1Score(info->precision, info->recall);
-    return false;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  info->macroAvgPrecision = 0;
-  info->macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    info->macroAvgPrecision +=
-        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  info->macroAvgPrecision /= numLabels;
-  info->macroAvgRecall /= numLabels;
-  info->macroAvgF1Score =
-      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
-
-  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  info->microAvgF1Score =
-      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
-  return true;
-}
-
-REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
-void PnpairEvaluator::start() {
-  Evaluator::start();
-  memset(pairArray_, 0, sizeof(pairArray_));
-  predictArray_.clear();
-}
-
-real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 3UL);
-  CHECK_LE(arguments.size(), 4UL);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  IVectorPtr info = arguments[2].ids;
-  bool supportWeight = (4 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
-  if (nullptr == output || nullptr == label ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t height = output->getHeight();
-  size_t width = output->getWidth();
-  CHECK_EQ(height, label->getSize());
-  CHECK_EQ(height, info->getSize());
-  if (supportWeight) {
-    CHECK_EQ(height, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    IVector::resizeOrCreate(cpuInfo_, height, false);
-    cpuOutput_->copyFrom(*output);
-    cpuLabel_->copyFrom(*label);
-    cpuInfo_->copyFrom(*info);
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    info = cpuInfo_;
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-      weight = cpuWeight_;
-    }
-  }
-
-  real* outputs = output->getData();
-  int* labels = label->getData();
-  int* infos = info->getData();
-  real* weights = supportWeight ? weight->getData() : nullptr;
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    real y1 = outputs[i * width + (width - 1)];
-    real w = supportWeight ? weights[i] : 1.0;
-    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
-  }
-  return 0;
-}
-
-void PnpairEvaluator::stat(size_t start,
-                           size_t end,
-                           PredictionResult* answers,
-                           double& pos,
-                           double& neg,
-                           double& spe) {
-  for (size_t i = start; i < end; i++) {
-    for (size_t j = i + 1; j < end; j++) {
-      CHECK_EQ(answers[i].queryid, answers[j].queryid);
-      // The pair weight is the mean of the two samples' weight
-      double weight = (answers[i].weight + answers[j].weight) / 2.0;
-      if (answers[i].label != answers[j].label) {
-        if ((answers[i].out > answers[j].out &&
-             answers[i].label > answers[j].label) ||
-            (answers[i].out < answers[j].out &&
-             answers[i].label < answers[j].label)) {
-          pos += weight;
-        } else if ((answers[i].out > answers[j].out &&
-                    answers[i].label < answers[j].label) ||
-                   (answers[i].out < answers[j].out &&
-                    answers[i].label > answers[j].label)) {
-          neg += weight;
-        } else {
-          spe += weight;
-        }
-      }
-    }
-  }
-}
-
-void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
-  std::sort(predictArray.begin(),
-            predictArray.end(),
-            [](const PredictionResult& x, const PredictionResult& y) {
-              return x.queryid < y.queryid;
-            });
-
-  double pos = 0;
-  double neg = 0;
-  double special = 0;
-  auto start = predictArray.begin();
-  while (start != predictArray.end()) {
-    auto end = std::find_if(
-        start + 1, predictArray.end(), [=](const PredictionResult& x) {
-          return x.queryid != start->queryid;
-        });
-    CHECK(end != start);
-    stat(start - predictArray.begin(),
-         end - predictArray.begin(),
-         predictArray.data(),
-         pos,
-         neg,
-         special);
-
-    start = end;
-  }
-
-  pairArray_[0] += pos;
-  pairArray_[1] += neg;
-
-  LOG(INFO) << " calc total pos pair: " << pos
-            << " calc total neg pair: " << neg
-            << " calc total special pair: " << special;
-}
-
-std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
-
-ClassRegistrar<Evaluator> Evaluator::registrar_;
-Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = registrar_.createByType(config.type());
-  evaluator->init(config);
-  return evaluator;
-}
-
-REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
-REGISTER_EVALUATOR(sum, SumEvaluator);
-static InitFunction __reg_type_auc_sum__([]() {
-  Evaluator::registrar_.registerClass(
-      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
-  Evaluator::registrar_.registerClass("last-column-auc",
-                                      [] { return new AucEvaluator(-1); });
-});
-
-/**
- * @brief print value of each layer.
- *
- * The config file api is value_printer_evaluator.
- */
-class ValuePrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
-                                                      "layer=" + name + " ");
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(value_printer, ValuePrinter);
-
-/**
- * @brief print gradient of each layer.
- *
- * The config file api is gradient_printer_evaluator.
- */
-class GradientPrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.grad) {
-        std::ostringstream os;
-        argu.grad->print(os);
-        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
-/**
- * @brief print row max id vctor of each layer
- *
- * The config file api is maxid_printer_evaluator.
- */
-class MaxIdPrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-
- public:
-  MaxIdPrinter() {}
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        size_t height = argu.value->getHeight();
-        size_t width = config_.num_results();
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-        argu.value->rowMax(*maxIds_, *maxValues_);
-        std::ostringstream os;
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t i = 0; i < height; ++i) {
-          for (size_t j = 0; j < width; ++j) {
-            size_t pos = i * width + j;
-            os << ids[pos] << " : " << values[pos] << ", ";
-          }
-          os << std::endl;
-        }
-        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
-/**
- * @brief print sequence max frames of each layer
- *
- * The config file api is maxframe_printer_evaluator.
- */
-class MaxFramePrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-  MatrixPtr value_;
-
- public:
-  MaxFramePrinter() {
-    value_ =
-        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-
-      CHECK_EQ(argu.value->getWidth(), 1LU);
-      size_t numSequences = argu.getNumSequences();
-      const int* starts = argu.sequenceStartPositions->getData(false);
-
-      std::ostringstream os;
-      for (size_t i = 0; i < numSequences; ++i) {
-        size_t offset = starts[i];
-        size_t size = starts[i + 1] - starts[i];
-        value_->setData(argu.value->getData() + offset, 1LU, size);
-
-        size_t height = 1LU;
-        size_t width = std::min((size_t)config_.num_results(), size);
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-
-        value_->rowMax(*maxIds_, *maxValues_);
-
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t j = 0; j < width; ++j) {
-          os << ids[j] << " : " << values[j] << ", ";
-        }
-        os << "total " << size << " frames" << std::endl;
-      }
-      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
-
-/**
- * @brief print text according to index matrix and a dictionary.
- *
- * There can be multiple input to this layer:
- * - If there is only one input, the input must be a matrix containing
- *      the sequence of indices;
- * - If there are more than one input, the first input should be ids,
- *      and are interpreted as sample ids.
- *
- * The output format will be:
- *
- * - sequence without sub-sequence, and there is probability.
- *
- *     @code
- *      id \t prob space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence without sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence with sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      ...
- *     @endcode
- *
- * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
- * with maxid (when generating) as an input.
- *
- * The config file api is seqtext_printer_evaluator.
- *
- */
-class SequenceTextPrinter : public NotGetableEvaluator {
- private:
-  /// dict_file, which contains a list of tokens
-  std::vector<std::string> dict_;
-  /// result_file, which is the output file
-  std::ofstream os_;
-  /// True/False, to indicate whether to use space to separate output tokens.
-  /// Default is True. No space is added if set to False.
-  bool delimited_;
-  /// store the cpu version of argument.ids
-  std::vector<IVectorPtr> cpuIds_;
-  /// store the probability associated with each sequence
-  std::vector<MatrixPtr> cpuIn_;
-
- public:
-  SequenceTextPrinter() {}
-
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (!config.dict_file().empty()) {
-      loadFileList(config.dict_file(), dict_);
-    }
-
-    os_.open(config.result_file(), std::ofstream::trunc);
-    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
-    delimited_ = config.delimited();
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), 1LU);
-    bool hasId = arguments.size() > 1;
-    size_t numSequences = arguments[0].getNumSequences();
-    if (hasId) {
-      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
-          << "first input must be sample id.";
-    }
-    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
-      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
-    }
-
-    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
-      if (src && src->useGpu()) {
-        IVector::resizeOrCreate(dest, src->getSize(), false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
-      if (src && src->useGpu()) {
-        Matrix::resizeOrCreate(
-            dest, src->getHeight(), src->getWidth(), false, false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    cpuIds_.resize(arguments.size());
-    cpuIn_.resize(arguments.size());
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      resizeVector(cpuIds_[i], arguments[i].ids);
-      resizeMatrix(cpuIn_[i], arguments[i].in);
-    }
-
-    int* sampleIds = nullptr;
-    if (hasId) {
-      sampleIds = cpuIds_[0]->getData();
-    }
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      os_ << (hasId ? sampleIds[i] : i);
-      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
-        int* output = cpuIds_[j]->getData();
-        const int* starts = arguments[j].sequenceStartPositions->getData(false);
-
-        auto seqPrint = [&](int start, int end) {
-          os_ << "\t";
-          for (int k = start; k < end; k++) {
-            int id = output[k];
-            os_ << (delimited_ ? " " : "");
-            if (!dict_.empty()) {
-              CHECK_LT((size_t)id, dict_.size());
-              os_ << dict_[id];
-            } else {
-              os_ << id;
-            }
-          }
-        };
-
-        if (arguments[j].hasSubseq()) {
-          // print sequence with sub-sequence
-          const int* subStarts =
-              arguments[j].subSequenceStartPositions->getData(false);
-          int subSeqId_start = 0;
-          int subSeqId_end = 0;
-          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
-               ++k) {
-            if (starts[i] == subStarts[k]) subSeqId_start = k;
-            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
-          }
-          for (int k = subSeqId_start; k < subSeqId_end; k++) {
-            seqPrint(subStarts[k], subStarts[k + 1]);
-            os_ << std::endl;
-          }
-
-        } else {
-          // print sequence without sub-sequence
-          if (arguments[j].in) {  // beam print
-            real* probs = cpuIn_[j]->rowBuf(i);
-            os_ << std::endl;
-            int start = starts[i];
-            int seqEnd = starts[i + 1];
-            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
-              if (start == seqEnd) {
-                break;
-              }
-              int end = start + output[start] + 2;
-              CHECK_LE(end, seqEnd);
-              CHECK_EQ(output[end - 1], -1);
-              os_ << k << "\t" << probs[k];
-              seqPrint(start + 1, end - 1);
-              os_ << std::endl;
-              start = end;
-            }
-          } else {
-            seqPrint(starts[i], starts[i + 1]);
-          }
-        }
-      }
-      os_ << std::endl;
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
-/**
- * @brief print classification error.
- *
- * The config file api is classification_error_printer_evaluator.
- */
-class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-
-    std::ostringstream os;
-    errorMat->print(os);
-    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
-              << os.str();
-
-    if (auto startPos = arguments[0].sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
-                << os.str();
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
-
-std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.h b/paddle/legacy/gserver/evaluators/Evaluator.h
deleted file mode 100644
index b3462819b12..00000000000
--- a/paddle/legacy/gserver/evaluators/Evaluator.h
+++ /dev/null
@@ -1,510 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-class NeuralNetwork;
-/**
- * @def REGISTER_EVALUATOR
- * @brief Macro for registering evaluator class
- */
-
-#define REGISTER_EVALUATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                \
-    Evaluator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-/**
- * @brief Base class for Evaluator
- * Evaluating the performance of a model is very important.
- * It indicates how successful the scores(predictions) of a datasets
- * has been by a trained model.
- */
-class Evaluator {
- public:
-  static Evaluator* create(const EvaluatorConfig& config);
-
-  Evaluator() : numSamples_(0), totalScore_(0) {}
-
-  virtual ~Evaluator() {}
-
-  virtual void init(const EvaluatorConfig& config) { config_ = config; }
-
-  /**
-   * @brief start to evaluate some data
-   */
-  virtual void start() {
-    numSamples_ = 0;
-    totalScore_ = 0;
-  }
-
-  /**
-   * @brief Process a batch of data.
-   */
-  virtual void eval(const NeuralNetwork& nn);
-
-  /**
-   * @brief Process a batch of data.
-   * @return the score for the batch if it make sense to sum the score across
-   * batches.
-   * @note Otherwise evaluator should return 0 and override finish() and
-   * printStats() to do the right calculation.
-   */
-  virtual real evalImp(std::vector<Argument>& arguments) = 0;
-
-  /**
-   * @brief Update the number of processed samples
-   */
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getBatchSize();
-  }
-
-  /// finish() should be called before distributeEval
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  void mergeResultsOfAllClients(ParameterClient2* client) {
-    double data[2] = {totalScore_, numSamples_};
-    client->reduce(data, data, 2, FLAGS_trainer_id, 0);
-    totalScore_ = data[0];
-    numSamples_ = data[1];
-  }
-
-  /**
-   * @brief finish the evaluation.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief print the statistics of evaluate result
-   * @note finish() should be called before printStats
-   */
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "="
-       << (numSamples_ ? totalScore_ / numSamples_ : 0);
-  }
-
-  friend std::ostream& operator<<(std::ostream& os,
-                                  const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return os;
-  }
-
-  friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
-                                   const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return std::move(os);
-  }
-
-  static ClassRegistrar<Evaluator> registrar_;
-
-  /**
-   * @brief getNames will return all field names of current evaluator.
-   *
-   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
-   * has multiple field, the name could be `evaluator_name.field1`. For example
-   * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get
-   * names will return `precision_recall_evaluator.precision`,
-   * `precision_recall_evaluator.recal`, etc.
-   *
-   * Also, if current Evaluator is a combined evaluator. getNames will return
-   * all names of all evaluators inside the combined evaluator.
-   *
-   * @param names [out]: the field names of current evaluator.
-   * @note Never clear the names parameter inside getNames.
-   */
-  virtual void getNames(std::vector<std::string>* names) {
-    names->push_back(config_.name());
-  }
-
-  /**
-   * @brief getValue will return the current evaluate value of one field.
-   *
-   * @param name: The field name of current evaluator.
-   * @param err [out]: The error state.
-   *
-   * @return The evaluate value(metric).
-   */
-  virtual real getValue(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return .0f;
-    }
-    return this->getValueImpl();
-  }
-
-  /**
-   * @brief getType will return the evaluator type by field name.
-   *
-   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
-   * 'precision_recall'. In combined evaluator, different name may get different
-   * evaluate type because it could be evaluated by different evaluator inside.
-   *
-   * @param name: The field name of current Evaluator.
-   * @param err: The error state. nullptr means don't care.
-   * @return the evaluator type string.
-   */
-  virtual std::string getType(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return std::string();
-    }
-    return this->getTypeImpl();
-  }
-
- protected:
-  /**
-   * @brief getValueImpl The simplest way to define getValue result. If this
-   * evaluator doesn't contain multiple fields, and do not throw any error, just
-   * implemented this method to get the evaluate result(metric).
-   * @return Evaluate result(metric).
-   */
-  virtual real getValueImpl() const {
-    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
-  }
-
-  /**
-   * @brief getTypeImpl The simplest way to define getType result. If this
-   * evaluator doesn't combine many evaluators, the get type should only return
-   * itself type.
-   * @return Evaluator type.
-   */
-  virtual std::string getTypeImpl() const { return "base"; }
-
- protected:
-  EvaluatorConfig config_;
-  double numSamples_;
-  double totalScore_;
-};
-
-/**
- * @brief The NotGetableEvaluator class is the base class of evaluator that
- * cannot get value in runtime. The most NotGetableEvaluator is Printer
- * Evaluator, which is only used to debug network configuration.
- */
-class NotGetableEvaluator : public Evaluator {
-  // Evaluator interface
- public:
-  void getNames(std::vector<std::string>* names) {}
-
-  real getValue(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return .0f;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return "";
-  }
-};
-
-class DummyEvaluator : public Evaluator {
- public:
-  DummyEvaluator() {}
-  virtual void init(const EvaluatorConfig&) {}
-  virtual void start() {}
-  virtual void eval(const NeuralNetwork&) {}
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-  virtual void finish() {}
-  virtual void printStats(std::ostream&) const {}
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const;
-};
-/**
- * @brief evaluate AUC using colIdx-th column as prediction.
- * The AUC(Area Under the Curve) is a common evaluation metric
- * for binary classification problems. It computes the area under
- * the receiver operating characteristic(ROC) curve.
- *
- * @note colIdx-th column
- *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is auc_evaluator.
- *
- */
-class AucEvaluator : public Evaluator {
- public:
-  AucEvaluator(int32_t colIdx)
-      : colIdx_(colIdx),
-        realColumnIdx_(0),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "=" << calcAuc();
-  }
-
-  virtual void distributeEval(ParameterClient2* client);
-
- private:
-  static const uint32_t kBinNum_ = (1 << 24) - 1;
-  static const int kNegativeLabel_ = 0;
-  double statPos_[kBinNum_ + 1];
-  double statNeg_[kBinNum_ + 1];
-  int32_t colIdx_;
-  uint32_t realColumnIdx_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  AucEvaluator() {}
-
-  inline static double trapezoidArea(double X1,
-                                     double X2,
-                                     double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  double calcAuc() const;
-
-  // Evaluator interface
- protected:
-  real getValueImpl() const;
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles
- * under the same query), and averages them. Each list should be organized
- * as a sequence. The inputs of this evaluator is [output, click, pv]. If pv
- * is not provided, it will be set to 1. The types of click and pv are
- * dense value.
- */
-class RankAucEvaluator : public Evaluator {
- public:
-  // evaluate ranking AUC
-  virtual void start();
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments);
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
- private:
-  MatrixPtr output_;
-  MatrixPtr click_;
-  MatrixPtr pv_;
-  std::vector<std::pair<real, int>> outputPair_;
-
-  double calcRankAuc(real* outputData,
-                     real* clickData,
-                     real* pvData,
-                     size_t size);
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief precision, recall and f1 score Evaluator
- * \f[
- * precision = \frac{tp}{tp+tn} \\
- * recall=\frac{tp}{tp+fn} \\
- * f1=2*\frac{precsion*recall}{precision+recall}
- * \f]
- *
- * The config file api is precision_recall_evaluator.
- */
-class PrecisionRecallEvaluator : public Evaluator {
- public:
-  // Evaluate precision, recall and F1 score
-  PrecisionRecallEvaluator()
-      : isMultiBinaryLabel_(false),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const;
-
-  virtual void distributeEval(ParameterClient2* client);
-
-  void getNames(std::vector<std::string>* names);
-
-  real getValue(const std::string& name, Error* err) const;
-
-  std::string getType(const std::string& name, Error* err) const;
-
-  struct StatsInfo {
-    /// numbers of true positives
-    double TP;
-    /// numbers of true negatives
-    double TN;
-    /// numbers of false positives
-    double FP;
-    /// numbers of false negatives
-    double FN;
-
-    StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
-  };
-
- private:
-  bool isMultiBinaryLabel_;
-  std::vector<StatsInfo> statsInfo_;
-
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  struct PrintStatsInfo {
-    double precision;
-    double recall;
-    double f1;
-    double macroAvgPrecision;
-    double macroAvgRecall;
-    double macroAvgF1Score;
-    double microAvgPrecision;
-    double microAvgRecall;
-    double microAvgF1Score;
-  };
-
-  bool getStatsInfo(PrintStatsInfo* info) const;
-
-  void calcStatsInfo(const MatrixPtr& output,
-                     const IVectorPtr& label,
-                     const MatrixPtr& weight);
-
-  void calcStatsInfoMulti(const MatrixPtr& output,
-                          const MatrixPtr& label,
-                          const MatrixPtr& weight);
-
-  inline static double calcPrecision(double TP, double FP) {
-    if (TP > 0.0 || FP > 0.0) {
-      return TP / (TP + FP);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcRecall(double TP, double FN) {
-    if (TP > 0.0 || FN > 0.0) {
-      return TP / (TP + FN);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcF1Score(double precision, double recall) {
-    if (precision > 0.0 || recall > 0.0) {
-      return 2 * precision * recall / (precision + recall);
-    } else {
-      return 0;
-    }
-  }
-
-  mutable std::unordered_map<std::string, real> values_;
-
-  void storeLocalValues() const;
-};
-
-/*
- * @brief positive-negative pair rate Evaluator
- *
- * The config file api is pnpair_evaluator.
- */
-class PnpairEvaluator : public Evaluator {
- public:
-  PnpairEvaluator()
-      : cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuInfo_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label, int __queryid, real __weight)
-        : out(__out), label(__label), queryid(__queryid), weight(__weight) {}
-    real out;
-    int label;
-    int queryid;
-    real weight;
-  };
-  std::vector<PredictionResult> predictArray_;
-  void printPredictResults() {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << " " << res.queryid << std::endl;
-    }
-  }
-
-  void stat(size_t start,
-            size_t end,
-            PredictionResult* answers,
-            double& pos,
-            double& neg,
-            double& spe);
-  void calc(std::vector<PredictionResult>& predictArray);
-
-  virtual void finish() { calc(predictArray_); }
-
-  virtual void printStats(std::ostream& os) const {
-    os << " pos/neg=" << this->getValueImpl();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0);
-    LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0]
-              << " calc total neg pair: " << pairArray_[1];
-  }
-
- private:
-  static const uint32_t kPairArrayNum_ = 2;
-  double pairArray_[kPairArrayNum_];
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  IVectorPtr cpuInfo_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
- protected:
-  real getValueImpl() const {
-    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
-  }
-  std::string getTypeImpl() const;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
deleted file mode 100644
index 1c4034d8bba..00000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GradientMachine.h"
-
-#include <fstream>
-#include "paddle/legacy/utils/Logging.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "GradientMachineMode.h"
-#include "MultiGradientMachine.h"
-#include "MultiNetwork.h"
-#include "ParallelNeuralNetwork.h"
-#endif
-
-namespace paddle {
-
-GradientMachine* GradientMachine::create(
-    const ModelConfig& config,
-    int mode,
-    const std::vector<ParameterType>& parameterTypes) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
-    return gm;
-  }
-  if (FLAGS_trainer_count > 1) {
-    return new MultiGradientMachine(config, FLAGS_use_gpu);
-  }
-#endif
-  if (FLAGS_trainer_count == 1) {  // single
-#ifndef PADDLE_MOBILE_INFERENCE
-    NeuralNetwork* nn;
-    if (config.type() == "multi_nn") {
-      /* multi submodel calculate, thread(s) will be initialized inside */
-      nn = new MultiNetwork("root");
-    } else if (FLAGS_parallel_nn) {
-      /* multi threads calculate */
-      nn = new ParallelNeuralNetwork();
-    } else {
-      /* single thread calculate */
-      nn = NeuralNetwork::create(config);
-    }
-#else
-    NeuralNetwork* nn = NeuralNetwork::create(config);
-#endif
-    ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
-      para->enableType(PARAMETER_VALUE);
-    };
-    nn->init(
-        config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes);
-    return nn;
-  }
-  LOG(FATAL) << "Unknown model type: " << config.type();
-  return nullptr;
-}
-
-void GradientMachine::saveParameters(const std::string& dir) const {
-  LOG(INFO) << "Saving parameters to " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->save(filename);
-    }
-  }
-}
-
-void GradientMachine::loadParameters(const std::string& dir) {
-  LOG(INFO) << "Loading parameters from " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->load(filename);
-    }
-  }
-}
-
-void GradientMachine::randParameters() {
-  LOG(INFO) << "Initing parameters..";
-
-  for (auto& para : parameters_) {
-    if (para->isFullSize()) {
-      para->randomize();
-    }
-  }
-  LOG(INFO) << "Init parameters done.";
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.h b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
deleted file mode 100644
index d4f754a9f4d..00000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachine.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-
-#include "ModelConfig.pb.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-#include "paddle/legacy/utils/Thread.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#endif
-
-namespace paddle {
-/**
- * @brief A gradient machine is capable of calculating some outputs given
- *        some inputs and performing gradient calculation based on the
- *        derivative from the outputs.
- *
- * A gradient machine can be either a full neural network or part of a neural
- * network.
- *
- * Usage for training:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Calculate gradient with respect to outArgs[i]->value
- *     and fill them into outArgs[i]->grad.
- *     This step can be skipped if your the outputs are from cost layers.
- *
- *  4. Call backward(). After backward, gradient of each parameter is
- *     accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT)
- *
- *  5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using
- *     gradients.
- *
- *  6. Clear gradients to zero.
- *
- * Usage for prediction:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Obtain the prediction result from outArgs[i]
- */
-
-typedef std::vector<LayerStatePtr> MachineState;
-
-class GradientMachine;
-
-typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
-
-class GradientMachine {
- public:
-  enum CreateMode {
-    kNormal = 0,
-    kSgdSparseCpuTraining = 3,
-    kTesting = 4,
-    kCustom = 10
-  };
-
-  /**
-   * Create a gradient machine from ModelConfig
-   * Parameter will have parameterTypes
-   */
-  static GradientMachine* create(
-      const ModelConfig& config,
-      int mode = kNormal,
-      const std::vector<ParameterType>& parameterTypes =
-          std::vector<ParameterType>{
-              PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
-
-  virtual ~GradientMachine() {}
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; }
-
-  /**
-   * @brief Forward propagation.
-   *
-   * Calculate outputs (outArgs) based the inputs (inArgs)
-   *
-   * @note: if passType==PASS_TEST, then backward() should not be called
-   */
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType) = 0;
-
-  /**
-   * @brief Backward propagation.
-   *
-   * Calculate the gradient of inArgs and parameter.
-   *
-   * This function should only be called after a corresponding forward() call.
-   * The caller is responsible for filling the correct grad for the outArgs
-   * obtained using forward().
-   *
-   * It may also change the grad field for the inArgs supplied at forward()
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * Combine forward() and backward(). For multithread training, this
-   * may be faster.
-   *
-   * @note: passType PASS_TEST is not allowed for forwardBackward().
-   */
-  virtual void forwardBackward(const std::vector<Argument>& inArgs,
-                               std::vector<Argument>* outArgs,
-                               PassType passType,
-                               const UpdateCallback& callback = nullptr) {
-    forward(inArgs, outArgs, passType);
-    backward(callback);
-  }
-
-  virtual Argument getLayerOutput(const std::string& layerName) = 0;
-
-  // see comment in Layer.h for the function with the same name
-  virtual void resetState() {}
-
-  // set machine state
-  virtual void setState(const MachineState& machineState) {}
-
-  // save machine state
-  virtual void getState(MachineState& machineState) {}
-
-  virtual void onPassEnd() = 0;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  /**
-   * Create an evaluator which can be used for eval()
-   */
-  virtual Evaluator* makeEvaluator() const = 0;
-
-  /**
-   * evaluate using the given evaluator
-   */
-  virtual void eval(Evaluator* evaluator) const = 0;
-#endif
-
-  std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  std::vector<ParameterPtr>& getNonStaticParameters() {
-    if (nonStaticParameters_.empty()) {
-      for (auto para : parameters_) {
-        if (!para->isStatic()) {
-          nonStaticParameters_.push_back(para);
-        }
-      }
-    }
-    return nonStaticParameters_;
-  }
-
-  inline bool hasStaticParameters() {
-    return parameters_.size() != getNonStaticParameters().size();
-  }
-
-  /**
-   * @brief   Used before formal training, start work-threads and set
-   *          trainer Parameters;
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void start() {}
-
-  /**
-   * @brief   check  each work-thread whether is failed/error/finish,
-   *          if not, return ture, and yes return false.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief   set the training status a "finished" value, the sub_work_threads
-   *          will option the change, and then exit.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual bool trainIsOn() { return true; }
-
-  /**
-   * @brief   when all or some of the sub-workThreads are suspended to waiting
-   *          controller's instructions, and after some processing done in the
-   *          controller, it will call this function to wake up all the pending
-   *          thread.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void restart() {}
-
-  /// Set the gradient of the output from outside.
-  virtual void setOutputGrad(const std::vector<Argument>& args) {
-    LOG(FATAL) << "Not implemented!";
-  }
-
-  void saveParameters(const std::string& dir) const;
-
-  void loadParameters(const std::string& dir);
-
-  void randParameters();
-
-  virtual void getStats(real& cost, int64_t& numProcessed) {
-    (void)cost;
-    (void)numProcessed;
-  }
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  virtual void releaseOutput() {}
-
- protected:
-  virtual void onLoadParameter() {}
-
-  std::vector<ParameterPtr> parameters_;
-  std::vector<ParameterPtr> nonStaticParameters_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
deleted file mode 100644
index 9a0b2643e03..00000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GradientMachineMode.h"
-
-namespace paddle {
-std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
-    IGradientMachineMode::modes_;
-}
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
deleted file mode 100644
index dd944a35f89..00000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "GradientMachine.h"
-#include "unordered_map"
-
-namespace paddle {
-
-class IGradientMachineMode {
- public:
-  virtual ~IGradientMachineMode() {}
-
- public:  // interfaces
-          /**
-           * @brief create current mode's gradient machine by model config.
-           * @param config model config
-           */
-  virtual GradientMachine* create(const ModelConfig& config) = 0;
-
-  /**
-   * @brief shouldBeMe the current mode of GradientMachine should be this mode.
-   * @param algo training algorithm name.
-   * @param trainerCount trainer count.
-   * @param isLocal is local mode (without pserver)
-   * @param isGpu is using gpu.
-   * @return true if mode should be this mode.
-   */
-  virtual bool shouldBeMe(const std::string& algo,
-                          size_t trainerCount,
-                          bool isLocal,
-                          bool isGpu) const = 0;
-
-  /**
-   * @brief Is data must be in cpu even if using gpu mode.
-   * @param trainerCount trainer count
-   * @return true if data must be gpu.
-   */
-  virtual bool isDataMustInCpu(size_t trainerCount) const = 0;
-
-  /**
-   * @brief Need not to use mini-batch method, and should train all data in one
-   * batch in one pass.
-   */
-  virtual bool needTrainWholeDataInOneBatch() const = 0;
-
- public:  // static methods.
-          /**
-           * @brief register a custom gradient machine mode.
-           * @note For user to register a custom gradient machine mode, id should >=
-           * kCustom.
-           * @param mode mode id.
-           * @param ptr mode description object.
-           */
-  static void regGradientMachineMode(
-      int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
-    modes_.insert(std::make_pair(mode, std::move(ptr)));
-  }
-
-  /**
-   * @brief get custom mode from mode id.
-   * @param mode mode id
-   * @return mode description object.
-   */
-  static IGradientMachineMode* mode(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode].get();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief helper function to test trainWholeDataInOneBatch or not for mode
-   */
-  static bool trainWholeDataInOneBatch(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->needTrainWholeDataInOneBatch();
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * @brief Try to get custom mode if we can.
-   * @param [out] mode the custom mode id.
-   * @param [in] algo algorithm name
-   * @param [in] trainerCount trainer count.
-   * @param [in] isLocal is local or not
-   * @param [in] isGpu using gpu or not.
-   * @return true if there is a custom mode fit these conditions.
-   */
-  static bool tryGetMode(int* mode,
-                         const std::string& algo,
-                         int32_t trainerCount,
-                         bool isLocal,
-                         bool isGpu) {
-    for (auto it = modes_.begin(); it != modes_.end(); ++it) {
-      if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) {
-        *mode = it->first;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /**
-   * @brief helper function for data must in cpu
-   */
-  static bool dataMustInCpu(int32_t mode, size_t trainerCount) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->isDataMustInCpu(trainerCount);
-    } else {
-      // provide data to cpu if using synchronized multi-gpu gradient machine.
-      return trainerCount > 1;
-    }
-  }
-
-  /**
-   * @brief try to create gradient machine by mode & config.
-   * @return nullptr if we cannot create a gradient machine by such mode.
-   */
-  static GradientMachine* tryCreateGradientMachine(int32_t mode,
-                                                   const ModelConfig& config) {
-    auto m = IGradientMachineMode::mode(mode);
-    if (m) {
-      return m->create(config);
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
-      modes_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
deleted file mode 100644
index 3ef0dfbfe2e..00000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
+++ /dev/null
@@ -1,898 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiGradientMachine.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-DEFINE_bool(allow_only_one_model_on_one_gpu,
-            true,
-            "If true, do not allow multiple models on one GPU device");
-
-namespace paddle {
-
-// get types of the parameters which need to be merged after backward()
-static void fillMergeTypes(PassType passType,
-                           std::vector<ParameterType>* mergeTypes) {
-  mergeTypes->clear();
-  if (passType != PASS_TEST) {
-    mergeTypes->push_back(PARAMETER_GRADIENT);
-  }
-}
-
-MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
-                                           bool useGpu)
-    : useGpu_(useGpu),
-      trainerBarrier_(FLAGS_trainer_count),
-      allBarrier_(FLAGS_trainer_count + 1),
-      inArgsCopied_(false) {
-  isPassGrad_ = false;
-  numThreads_ = FLAGS_trainer_count;
-  if (useGpu) {
-    //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
-    //! the hl_get_device_count will get an error result. It seems should return
-    //! 0 when hppl is not compiled as gpu version.
-    numDevices_ = hl_get_device_count();
-  } else {
-    numDevices_ = 0;
-  }
-  ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) {
-    // only create buf for CPU parameters
-    // GPU parameters will be created in each thread
-    if (para->useGpu()) return;
-
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-    } else if (para->isGradSparseUpdate()) {
-      para->enableType(PARAMETER_VALUE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS);
-      SparseRowIdsCpuMatrix* mat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      mat->setNumOfThreads(FLAGS_trainer_count);
-    } else if (para->isValueShared()) {
-      para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    } else {
-      para->enableType(PARAMETER_VALUE);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    }
-  };
-
-  NeuralNetwork* nn = NeuralNetwork::create(config);
-  nn->init(config, mainParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-
-  numLogicalDevices_ = 0;
-  if (useGpu_) {
-    numLogicalDevices_ = 1;
-
-    for (size_t pid = 0; pid < parameters_.size(); pid++) {
-      if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) {
-        numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1;
-      }
-    }
-    LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_
-              << " numThreads=" << numThreads_ << " numDevices=" << numDevices_;
-
-    if (numLogicalDevices_ * numThreads_ > numDevices_ &&
-        FLAGS_allow_only_one_model_on_one_gpu) {
-      LOG(FATAL) << "trainer_count * num_devices_in_model "
-                 << "(" << numThreads_ << "*" << numLogicalDevices_ << ")"
-                 << "=" << numThreads_ * numLogicalDevices_
-                 << " exceeds number of GPU devices(" << numDevices_ << ")";
-    }
-    numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_);
-
-    /* Enables direct access to memory allocations on a peer device */
-    for (int i = 0; i < numThreads_; i++) {
-      for (int d = 0; d < numLogicalDevices_; ++d) {
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i + 1));
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i - 1));
-      }
-    }
-  }
-
-  for (int i = 0; i < numThreads_; ++i) {
-    threads_.emplace_back(new TrainerThread(config, i, this));
-  }
-
-  bufferSizes_.resize(numLogicalDevices_, 0);
-  paraMainThread_.reserve(parameters_.size());
-  int pid = 0;
-  for (auto& para : parameters_) {
-    if (para->isStatic() || !para->useGpu()) {
-      paraMainThread_.push_back(0);
-    } else {
-      int end = pid++ % numThreads_;
-      paraMainThread_.push_back(end);
-      int paraDeviceId = para->getDeviceId();
-      if (paraDeviceId == -1) paraDeviceId = 0;
-      paraDeviceId = paraDeviceId % numLogicalDevices_;
-      if (para->getSize() > bufferSizes_[paraDeviceId]) {
-        bufferSizes_[paraDeviceId] = para->getSize();
-        VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize();
-      }
-    }
-  }
-
-  // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller
-  // fixed buffer size and use pipeline to dispatch parameter value and merge
-  // parameter gradient, which may be faster.
-
-  // combination of all trainers mainPara into GradientMachine parameters
-  hasNonstaticCpuParamters_ = false;
-  for (size_t pid = 0; pid < parameters_.size(); pid++) {
-    if (parameters_[pid]->useGpu()) {
-      parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid];
-    } else if (!parameters_[pid]->isStatic()) {
-      hasNonstaticCpuParamters_ = true;
-    }
-  }
-
-  gradBufs_.resize(numThreads_);
-  for (int i = 0; i < numThreads_; ++i) {
-    gradBufs_[i].resize(numLogicalDevices_);
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      gradBufs_[i][d].sem.post();
-    }
-  }
-
-  outArgStream_ = HPPL_STREAM_1;
-
-  start();
-}
-
-void MultiGradientMachine::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
-std::vector<const std::vector<ParameterPtr>*>
-MultiGradientMachine::getSlaveParameters() {
-  std::vector<const std::vector<ParameterPtr>*> vec;
-  vec.reserve(threads_.size());
-  for (auto& thread : threads_) {
-    vec.push_back(&thread->getParameters());
-  }
-  return vec;
-}
-
-void MultiGradientMachine::notifyGradientTransfer(int paramId) {
-  gradQueue_.enqueue(paramId);
-}
-
-void MultiGradientMachine::allocGradBufs() {
-  if (numLogicalDevices_ == 0) return;
-  if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return;
-
-  for (int i = 0; i < numThreads_; i++) {
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      if (bufferSizes_[d] == 0) continue;
-      SetDevice device(logicalDeviceId2RealDeviceId(d, i));
-      for (size_t j = 0; j < mergeTypes_.size(); j++) {
-        gradBufs_[i][d].bufs.push_back(
-            Vector::create(bufferSizes_[d], /* useGpu= */ true));
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  // Each gradient machine in threads needs to do prefetch on its own
-  // part of inArgs. So we need to first divide inArgs to each thread
-  inArgs_ = inArgs;
-  startTask(TASK_COPY_IN_ARGS);
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->clearIndices();
-    }
-  }
-
-  waitForCopyInArgs();
-
-  // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread
-  // at one time, we need to do prefetch sequentially
-  for (auto& thread : threads_) {
-    thread->prefetch();
-  }
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->setupIndices();
-      auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      matGrad->reserveStore();
-    }
-  }
-}
-
-void MultiGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType) {
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD);
-}
-
-void MultiGradientMachine::forwardImp(const std::vector<Argument>& inArgs,
-                                      std::vector<Argument>* outArgs,
-                                      PassType passType,
-                                      TaskType taskType) {
-  updateThreadParameters();
-  passType_ = passType;
-
-  if (!inArgsCopied_) {
-    inArgs_ = inArgs;
-    inArgsCopied_ = false;
-  }
-
-  fillMergeTypes(passType, &mergeTypes_);
-  allocGradBufs();
-  startTask(taskType);
-
-  getOutArgs(outArgs, passType);
-}
-
-void MultiGradientMachine::backward(const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  startTask(TASK_BACKWARD);
-  backwardImp(callback);
-}
-
-void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>* outArgs,
-                                           PassType passType,
-                                           const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD);
-  backwardImp(callback);
-}
-
-Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
-  std::vector<Argument> args;
-  args.reserve(threads_.size());
-
-  for (auto& thread : threads_) {
-    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
-  }
-  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
-
-  return outLayerArgs_;
-}
-
-void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
-  for (size_t i = 0; i < parameters_.size(); i++) {
-    if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
-    REGISTER_TIMER("controller_dequeue");
-    gradQueue_.dequeue();
-  }
-  if (hasNonstaticCpuParamters()) {
-    waitAfterMerge();
-    if (backwardCallback_) {
-      for (auto& para : parameters_) {
-        if (!para->useGpu() && !para->isStatic()) {
-          backwardCallback_(para.get());
-        }
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::updateThreadParameters() {
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    if (!parameters_[pid]->useGpu()) continue;
-    if (!parameters_[pid]->isValueUpdated()) continue;
-    parameters_[pid]->clearValueUpdated();
-    for (int i = 0; i < (int)threads_.size(); i++) {
-      threads_[i]->incUpdateCounter();
-    }
-    // NotifyValueReady should happen after that all threads' incUpdateCounter()
-    // are called so that the counters are correct when notifyValueReady()
-    // is called.
-    threads_[paraMainThread_[pid]]->notifyValueReady(pid);
-  }
-}
-
-void MultiGradientMachine::onPassEnd() {
-  for (auto& thread : threads_) {
-    thread->onPassEnd();
-  }
-}
-
-Evaluator* MultiGradientMachine::makeEvaluator() const {
-  return threads_[0]->getGradientMachine()->makeEvaluator();
-}
-
-void MultiGradientMachine::eval(Evaluator* evaluator) const {
-  for (auto& thread : threads_) {
-    SetDevice device(thread->getDeviceId());
-    if (thread->hasInputData()) {
-      thread->getGradientMachine()->eval(evaluator);
-    }
-  }
-}
-
-void MultiGradientMachine::getOutArgs(std::vector<Argument>* outArgs,
-                                      PassType passType) {
-  for (auto& thread : threads_) {
-    REGISTER_TIMER("waitOutArgs");
-    thread->waitOutArgsReady();
-  }
-
-  outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size());
-
-  REGISTER_TIMER("copyOutArgs");
-  for (size_t i = 0; i < outArgs_.size(); ++i) {
-    std::vector<Argument> args;
-    args.reserve(threads_.size());
-    for (auto& thread : threads_) {
-      // If the thread input is empty, then the output is empty.
-      auto tmp = thread->getOutArgs();
-      if (tmp.size() > 0) {
-        args.push_back(tmp[i]);
-      }
-    }
-    outArgs_[i].concat(args, useGpu_, outArgStream_, passType);
-  }
-
-  if (useGpu_) {
-    hl_stream_synchronize(outArgStream_);
-  }
-
-  *outArgs = outArgs_;
-}
-
-void MultiGradientMachine::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_EQ(args.size(), outArgs_.size());
-  for (size_t i = 0; i < args.size(); i++) {
-    outArgs_[i].grad = args[i].grad;
-  }
-}
-
-void MultiGradientMachine::startTask(TaskType taskType) {
-  taskType_ = taskType;
-  for (auto& thread : threads_) {
-    thread->notifyTaskReady();
-  }
-}
-
-TrainerThread::TrainerThread(const ModelConfig& config,
-                             int threadId,
-                             MultiGradientMachine* multiMachine)
-    : multiMachine_(multiMachine),
-      config_(config),
-      threadId_(threadId),
-      inArgsCopied_(false) {
-  int numThreads = multiMachine->getNumThreads();
-
-  auto& mainParas = multiMachine->getParameters();
-
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-
-  partnerId_ = mod(threadId_ - 1, numThreads);
-
-  deviceId_ = !multiMachine_->useGpu()
-                  ? -1
-                  : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_);
-  SetDevice gpuDevice(deviceId_);
-
-  NeuralNetwork* nn = nullptr;
-  if (!multiMachine->useGpu() || !FLAGS_parallel_nn) {
-    nn = NeuralNetwork::create(config);
-  } else {
-    nn = new ParallelNeuralNetwork();
-    for (auto& paraConfig : *config_.mutable_parameters()) {
-      if (paraConfig.device() != -1) {
-        paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            paraConfig.device(), threadId_));
-      }
-    }
-    for (auto& layerConfig : *config_.mutable_layers()) {
-      if (layerConfig.device() != -1) {
-        layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            layerConfig.device(), threadId_));
-      }
-    }
-  }
-  // Only GPU do not share parameter values with main paramters.
-  ParamInitCallback slaveParamInitCb =
-      std::bind(parameterInitNN, _1, _2, &mainParas);
-  nn->init(config_, slaveParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-  if (!FLAGS_parallel_nn) {
-    for (auto& para : parameters_) {
-      para->setDevice(deviceId_);
-    }
-  }
-
-  backwardCallback_ =
-      std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1);
-
-  gradStream_ = HPPL_STREAM_2;
-  valueStream_ = HPPL_STREAM_3;
-  stopping_ = true;
-  updateCounter_ = 0;
-  parameterUpdated_ = false;
-}
-
-TrainerThread::~TrainerThread() { stop(); }
-
-void TrainerThread::start() {
-  if (!stopping_) return;
-
-  stopping_ = false;
-
-  gradientMachine_->start();
-
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-
-  if (multiMachine_->useGpu()) {
-    gradCollectThread_.reset(
-        new std::thread([this]() { gradCollectThread(); }));
-
-    valueDispatchThread_.reset(
-        new std::thread([this]() { valueDispatchThread(); }));
-
-    copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); }));
-  }
-}
-
-void TrainerThread::stop() {
-  if (stopping_) return;
-
-  stopping_ = true;
-
-  if (computeThread_) {
-    taskReadySem_.post();
-    computeThread_->join();
-  }
-  if (gradCollectThread_) {
-    gradQueue_.enqueue(0);
-    gradCollectThread_->join();
-  }
-  if (copyThread_) {
-    gradBufQueue_.enqueue(0);
-    copyThread_->join();
-  }
-  if (valueDispatchThread_) {
-    valueReadyQueue_.enqueue(0);
-    valueDispatchThread_->join();
-  }
-}
-
-void TrainerThread::computeThread() {
-  VLOG(1) << "gradComputeThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    {
-      REGISTER_TIMER("taskSem_wait");
-      taskReadySem_.wait();
-    }
-
-    if (stopping_) break;
-
-    switch (multiMachine_->getTaskType()) {
-      case MultiGradientMachine::TASK_FORWARD_BACKWARD:
-        forward();
-        backward();
-        break;
-      case MultiGradientMachine::TASK_FORWARD:
-        forward();
-        break;
-      case MultiGradientMachine::TASK_BACKWARD:
-        backward();
-        break;
-      case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        batchSize_ = copyInArgs();
-        inArgsCopied_ = true;
-        multiMachine_->waitForCopyInArgs();
-        break;
-    }
-  }
-  hl_fini();
-}
-
-void TrainerThread::prefetch() {
-  SetDevice setDevice(deviceId_);
-  gradientMachine_->prefetch(inArgs_);
-}
-
-void TrainerThread::forward() {
-  if (!inArgsCopied_) {
-    REGISTER_TIMER("copyInArgs");
-    batchSize_ = copyInArgs();
-  } else {
-    inArgsCopied_ = false;
-  }
-
-  if (multiMachine_->getPassType() != PASS_TEST) {
-    REGISTER_TIMER("clearGradient");
-    // For main parameter, the user of MultiGpuSyncMachine is responsible
-    // for setting the gradient to zero
-    for (size_t i = 0; i < parameters_.size(); i++) {
-      if (parameters_[i]->useGpu()) {
-        if (multiMachine_->paraMainThread(i) != threadId_) {
-          SetDevice device(parameters_[i]->getDeviceId());
-          parameters_[i]->clearGradient();
-        }
-      } else {
-        parameters_[i]->clearGradient();
-      }
-    }
-  }
-
-  {
-    REGISTER_TIMER("wait_value");
-    valueReadyCond_.wait([this]() { return !parameterUpdated_; });
-  }
-
-  { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); }
-
-  {
-    REGISTER_TIMER("thread_forward");
-    if (batchSize_ > 0) {
-      gradientMachine_->forward(
-          inArgs_, &outArgs_, multiMachine_->getPassType());
-    } else {
-      outArgs_.clear();
-    }
-  }
-  outArgsReadySem_.post();
-}
-
-void TrainerThread::backward() {
-  REGISTER_TIMER("thread_backward");
-  if (multiMachine_->isPassGrad()) {
-    copyOutputGrad();
-  }
-  if (batchSize_ > 0) {
-    gradientMachine_->backward(backwardCallback_);
-  } else {
-    for (size_t i = parameters_.size(); i > 0; i--) {
-      backwardCallback(parameters_[i - 1].get());
-    }
-  }
-  if (multiMachine_->hasNonstaticCpuParamters()) {
-    mergeCpuGradients();
-  }
-}
-
-void TrainerThread::backwardCallback(Parameter* para) {
-  // CPU parameters are merged in the end
-  if (!para->useGpu() || para->isStatic()) return;
-
-  int paramId = para->getID();
-  if (multiMachine_->getNumThreads() == 1) {
-    // no need to do merge if there is only one thread
-    doCallback(paramId);
-  } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1,
-                              multiMachine_->getNumThreads())) {
-    notifyCopyGradToBuffer(paramId);
-  } else {
-    notifyGradientCollect(paramId);
-  }
-}
-
-void TrainerThread::copyGradToBufferThread() {
-  VLOG(1) << "copyGradToBufferThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-  auto& partnerThread = multiMachine_->getThread(partnerId_);
-  auto& gradBufs = multiMachine_->getGradBuf(partnerId_);
-
-  while (true) {
-    int pid = gradBufQueue_.dequeue();
-    if (stopping_) break;
-
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("waitBufferReady");
-      gradBuf.sem.wait();
-    }
-
-    {
-      REGISTER_TIMER("copyGradToBuffer");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        gradBuf.bufs[i]->resize(
-            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
-        gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]),
-                                  gradStream_);
-      }
-      hl_stream_synchronize(gradStream_);
-    }
-    partnerThread->notifyGradientCollect(pid);
-  }
-  hl_fini();
-}
-
-void TrainerThread::gradCollectThread() {
-  VLOG(1) << "gradCollectThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  std::vector<size_t> gradReadyCount(parameters_.size(), 0);
-
-  auto& gradBufs = multiMachine_->getGradBuf(threadId_);
-
-  while (true) {
-    int pid = gradQueue_.dequeue();
-    if (stopping_) break;
-
-    if (++gradReadyCount[pid] < 2) continue;
-    gradReadyCount[pid] = 0;
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("mergeGrad");
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        ParameterType type = mergeTypes_[i];
-        const VectorPtr& localGrad = parameters_[pid]->getBuf(type);
-        SetDevice setDevice(parameters_[pid]->getDeviceId());
-        localGrad->add(*gradBuf.bufs[i]);
-      }
-    }
-
-    gradBuf.sem.post();
-
-    if (multiMachine_->paraMainThread(pid) == threadId_) {
-      doCallback(pid);
-    } else {
-      notifyCopyGradToBuffer(pid);
-    }
-  }
-  hl_fini();
-}
-
-void TrainerThread::doCallback(int pid) {
-  REGISTER_TIMER("callback");
-  auto& gpuThreads = multiMachine_->getAllThreads();
-  if (multiMachine_->getBackwardCallback()) {
-    // The callback supplied by the user of MultiGradientMachine may handle
-    // the parameter update using the gradient.
-    multiMachine_->getBackwardCallback()(parameters_[pid].get());
-    if (parameters_[pid]->isValueUpdated()) {
-      parameters_[pid]->clearValueUpdated();
-      for (auto& thread : gpuThreads) {
-        thread->incUpdateCounter();
-      }
-      notifyValueReady(pid);
-    }
-  }
-  multiMachine_->notifyGradientTransfer(pid);
-}
-
-void TrainerThread::valueDispatchThread() {
-  VLOG(1) << "valueDispatchThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  auto& thread = multiMachine_->getThread(partnerId_);
-
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("value_dequeue");
-      pid = valueReadyQueue_.dequeue();
-    }
-    if (stopping_) break;
-
-    if (multiMachine_->paraMainThread(pid) == partnerId_) continue;
-
-    {
-      REGISTER_TIMER("copyValue");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_);
-      hl_stream_synchronize(valueStream_);
-    }
-
-    thread->notifyValueReady(pid);
-  }
-  hl_fini();
-}
-
-void TrainerThread::notifyValueReady(int paramId) {
-  if (--updateCounter_ == 0) {
-    valueReadyCond_.notify_all([this] { parameterUpdated_ = false; });
-  }
-
-  notifyValueDispatch(paramId);
-}
-
-int TrainerThread::copyInArgs() {
-  const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = fullInArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-
-  /**
-   * For the first copy, need to allocate space here
-   */
-  if (inArgs_.size() == 0) {
-    inArgs_.resize(fullInArgs.size());
-  }
-
-  if (copySize == 0) {
-    return 0;
-  }
-
-  for (size_t i = 0; i < fullInArgs.size(); i++) {
-    inArgs_[i].resizeAndCopyFrom(
-        fullInArgs[i],
-        startSeq,
-        copySize,
-        FLAGS_parallel_nn ? false : multiMachine_->useGpu());
-  }
-  return copySize;
-}
-
-void TrainerThread::mergeCpuGradients() {
-  CHECK_EQ(mergeTypes_.size(), 1UL);
-  CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT);
-
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitBeforeMerge();
-  }
-  std::vector<const std::vector<ParameterPtr>*> slaveParameters =
-      multiMachine_->getSlaveParameters();
-
-  CHECK(slaveParameters.size());
-  for (auto& para : multiMachine_->getNonStaticParameters()) {
-    if (para->useGpu()) continue;
-    if (para->isSparseRemoteUpdate()) {
-      REGISTER_TIMER("mergeRemoteGradSparse");
-      mergeGradSparseRemote(para.get(), slaveParameters);
-    } else if (para->isGradSparseUpdate()) {
-      REGISTER_TIMER("mergeGradSparse");
-      mergeGradSparse(para.get(), slaveParameters);
-    } else {
-      REGISTER_TIMER("mergeGradDense");
-      mergeGradDense(para.get(), slaveParameters);
-    }
-  }
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitAfterMerge();
-  }
-}
-
-void TrainerThread::mergeGradSparse(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-      para->getMat(PARAMETER_GRADIENT).get());
-  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads());
-    // we use a sample hash method(%) instead of range partition,
-    // because range partition has balance issue sometimes,
-    // when feature ids are not generated from hashcode.
-  }
-  uniqueIds(ids);
-}
-
-void TrainerThread::mergeGradSparseRemote(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowCpuMatrix* mainMat =
-      dynamic_cast<SparseRowCpuMatrix*>(para->getMat(PARAMETER_GRADIENT).get());
-
-  mainMat->checkIndices();
-  mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads());
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads());
-  }
-}
-
-void TrainerThread::mergeGradDense(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  auto interval = calcSplitArrayInterval(para->getSize(),
-                                         (size_t)threadId_,
-                                         multiMachine_->getNumThreads(),
-                                         8LU /*for avx*/);
-  size_t startSeq = interval.first;
-  size_t copySize = interval.second - interval.first;
-
-  // setup sub bufs
-  CpuVector destGrad(0, nullptr);
-  destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-
-  // merge
-  CpuVector slaveGradSub(0, nullptr);
-  for (auto slaveParams : slaveParameters) {
-    slaveGradSub.subVecFrom(
-        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-    destGrad.add(slaveGradSub);
-  }
-}
-
-void TrainerThread::copyOutputGrad() {
-  const std::vector<Argument>& outputGradArgs = multiMachine_->outArgs_;
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = outputGradArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-  outArgs_.resize(outputGradArgs.size());
-  for (size_t i = 0; i < outputGradArgs.size(); i++) {
-    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i],
-                                  startSeq,
-                                  copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-  gradientMachine_->setOutputGrad(outArgs_);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
deleted file mode 100644
index 674acd41249..00000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-
-#include "GradientMachine.h"
-
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Queue.h"
-
-namespace paddle {
-
-class TrainerThread;
-
-typedef Queue<int> PidQueue;
-typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
-
-struct GradBuffer {
-  /// GradBuffer is used for gathering gradient for GPU parameters
-  int paramId;
-
-  /// sem is used to notify that the local gradient merge of the current thread
-  /// finished for the current thread.
-  Semaphore sem;
-
-  // bufs[mergeIndex]
-  std::vector<VectorPtr> bufs;
-};
-
-/**
- *  A MultiGradientMachine is a synchronous GradientMachine which devides
- *  one data batch into several smaller batches and assign each one small batch
- *  to one computint thread for computation. After each thread finishes
- *  computation, it merges result (including output Argument and gradient during
- *  backward()). It basically is the same as single thread gradient machine,
- *  except that it uses multi-thread to do the computation.
- *
- *  It handles GPU and Cpu parameters differently.  In GPU, one computing thread
- *  generally corresponds to one GPU device. Thus, each thread keeps a separate
- *  copy of the parameter in its own device's memory. In CPU, we only need to
- keep
- *  one copy of the parameters in the main memory. After, each computing thread
- *  computes its own parameter gradient, the update process needs to accumulate
- *  the parameter gradients from all the computing threads, and update the
- *  accumulated parameter gradient to the corresponding parameter value.
- *
- *  Each GPU parameter is assigned to a thread called its main thread. For each
- *  parameter, the accumulation of its gradients and the update of its value
- *  happens in its main thread. The main thread first gather the parameter
- *  gradients from all the computing thread. Then, it performs parameter update.
- *  After a gradient is updated by the main thread, it is scattered to all the
- *  computing thread so that the parameters in all the computing threads are
- *  synchronized. The scatter and gather process are implemented by ring-style
- *  communication. Assume we have N computing threads, its thread ids will be
- *  0, 1, ..., N-1. For each parameter, the id of the main thread is specified
- in
- *  paraMainThread_[pid], where pid is the id of the parameter. Each thread i
- only
- *  sends data to its partner thread (i - 1) % N. For example, for a parameter
- *  gradient that is computed in thread 4, and its main thread is 2. Its
- *  traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the
- gradient
- *  buffer is added to the local gradient, and the local gradient is then copied
- *  to the gradient buffer of the next thread. At last, its main thread 2 will
- *  get the accumulated parameter gradient. For the same parameter, after its
- *  value is updated, the value's traveling process would be 2, 1, 0, N-1, ...
- 3.
- *  At the end, all the computing threads would have the updated parameter
- value.
- *
- *  A computing thread (TrainerThread) uses 4 threads to do different jobs:
- *
- *  1. computeThread(): performing forward(), backward(), prefetch().
- *
- *  2. valueDispatchThread(): copying parameter values to partner thread.
- *
- *  3. copyGradToBufferThread(): copying parameter gradient to partner thread.
- *
- *  4. gradCollectThread(): merging the gradient from step 3 with local gradient
- *     and call the callback supplied by the user to update parameter value.
- *
- *  CPU parameter value has only one copy. And their gradients are merged at the
- *  end of backward().
- *
- *  * Handling of sparse update
- *  Currently, sparse update is only supported for CPU parameters.
-
- *  Sparse updates refers to gradient caculation where the gradient is sparse.
- For
- *  example, if the input argument to a 'fc' layer is sparse, the gradient of
- the
- *  weight matrix of this layer will be sparse. It is usually more efficient to
- *  treat the gradient explicitly as sparse vector during the parameter update.
-
- *  There are two types of sparse updates called local sparse update and remote
- *  sparse update.
-
- *  For both types of sparse updates, there is one copy of parameter value and
- *  gradient called main parameter value and gradient, and there is a copy of
- *  parameter value and gradient for each computing thread called slave
- parameter
- *  value and gradient. The slave parameter values are always shared with the
- *  corresponding main parameter value. The slave parameter grad is a sparse row
- *  matrix. The sparse pattern for slave parameter grads are different, because
- *  the small batches for each computing thread might have different sparsity
- *  pattern.
-
- *  1. Local sparse update
- *
- *     Main parameter value type is MAT_NORMAL. It is a dense matrix.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix)
- *     It is also a dense matrix, but the updated values are specified by IDS.
- *
- *     Slave parameter value shares with main parameter value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating
- *     which rows have nonzero gradient.
- *
- *  2. Remote sparse update
- *
- *     Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE)
- *     (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix.
- *     MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the
- *     parameter values that are prefetched is up-to-date.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix).
- *     And it shares sparse pattern with value by sharing indexDictHandle_,
- which
- *     is an internal data structure used by SparseRowCpuMatrixto specify the
- *     sparsity pattern of Slave parameter value shares with main parameter
- value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparsePrefetchRowCpuMatrix). It is a sparse row matrix
- *
- *     During prefetch(), all the layers will indicates which rows of each
- *     parameter are needed. Then the framework will retrieve those rows from
- *     parameter server.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowCpuMatrix). And the framework will
- send
- *     the merged gradient to parameter server.
- */
-class MultiGradientMachine : public GradientMachine {
- public:
-  enum TaskType {
-    TASK_FORWARD_BACKWARD = 0,
-    TASK_FORWARD = 1,
-    TASK_BACKWARD = 2,
-    TASK_COPY_IN_ARGS = 3,
-  };
-
-  explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
-
-  virtual void start();
-
-  virtual void finish();
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  bool useGpu() const { return useGpu_; }
-
-  /// @return whether to pass the gradients in outArgs_ to each threads.
-  bool isPassGrad() { return isPassGrad_; }
-
-  /// @brief set whether to pass the gradient in outArgs_ to each threads.
-  void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
-
-  /// Set the gradients of the outputs.
-  /// The gradietns will be copied to each thread in the computing threads.
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
- protected:
-  friend class TrainerThread;
-
-  std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
-  /// Calculate the real device id based on the logical device id and the
-  /// thread id.
-  int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
-    if (logicalId == -1) {
-      logicalId = 0;
-    }
-    return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_,
-               numDevices_);
-  }
-
-  /// Calculate the logical device id based on the real device id and the
-  /// thread id.
-  int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
-    if (realId == -1) {
-      return 0;
-    } else {
-      return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_,
-                 numDevices_);
-    }
-  }
-
-  std::vector<const std::vector<ParameterPtr>*> getSlaveParameters();
-
-  bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; }
-
-  /// Called TrainerThread to wait before merging CPU parameter gradients.
-  void waitBeforeMerge() { trainerBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait after merging
-  /// CPU parameter graidents.
-  void waitAfterMerge() { allBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
-  /// finishing
-  void waitForCopyInArgs() { allBarrier_.wait(); }
-
-  TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; }
-
-  std::vector<GradBuffer>& getGradBuf(int threadId) {
-    return gradBufs_[threadId];
-  }
-
-  PassType getPassType() const { return passType_; }
-
-  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
-  /// for paramId is ready
-  void notifyGradientTransfer(int paramId);
-
-  const std::vector<Argument>& getInArgs() { return inArgs_; }
-
-  TaskType getTaskType() const { return taskType_; }
-
-  const UpdateCallback& getBackwardCallback() const {
-    return backwardCallback_;
-  }
-
-  int getNumDevices() const { return numDevices_; }
-
-  int getNumLogicalDevices() const { return numLogicalDevices_; }
-
-  int getNumThreads() const { return numThreads_; }
-
-  int paraMainThread(int pid) const { return paraMainThread_[pid]; }
-
- protected:
-  virtual void forwardImp(const std::vector<Argument>& inArgs,
-                          std::vector<Argument>* outArgs,
-                          PassType passType,
-                          TaskType taskType);
-
-  virtual void backwardImp(const UpdateCallback& callback = NULL);
-
-  /// update all parameters
-  void updateThreadParameters();
-
-  void startTask(TaskType taskType);
-
-  void getOutArgs(std::vector<Argument>* outArgs, PassType passType);
-
-  void allocGradBufs();
-
- protected:
-  bool useGpu_;
-
-  bool hasNonstaticCpuParamters_;
-
-  /// store main parameter only
-  std::unique_ptr<GradientMachine> gradientMachine_;
-
-  std::vector<TrainerThreadPtr> threads_;
-  std::vector<int> paraMainThread_;
-  std::vector<std::vector<GradBuffer>> gradBufs_;  // [threadId][deviceId]
-  std::vector<size_t> bufferSizes_;
-
-  PassType passType_;
-  TaskType taskType_;
-  PidQueue gradQueue_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  hl_stream_t outArgStream_;
-
-  Argument outLayerArgs_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-  int numDevices_;         /* number of gpu devices */
-  int numLogicalDevices_;  // number of GPU used by one NN
-  int numThreads_;         /* number of train threads */
-
-  UpdateCallback backwardCallback_;
-
-  /// barrrier for threads_
-  ThreadBarrier trainerBarrier_;
-
-  /// barrier for both MultiGradientMachine and threds_
-  ThreadBarrier allBarrier_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-
-  /// Whether to copy the gradient back from an external input.
-  bool isPassGrad_;
-};
-
-class TrainerThread {
- public:
-  TrainerThread(const ModelConfig& config,
-                int threadId,
-                MultiGradientMachine* multiMachine);
-
-  ~TrainerThread();
-
-  void start();
-
-  void onPassEnd() { gradientMachine_->onPassEnd(); }
-
-  void waitOutArgsReady() { outArgsReadySem_.wait(); }
-
-  void notifyTaskReady() { taskReadySem_.post(); }
-
-  int getDeviceId() const { return deviceId_; }
-
-  GradientMachine* getGradientMachine() { return gradientMachine_.get(); }
-
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  void stop();
-
-  void notifyValueReady(int paramId);
-
-  const VectorPtr& getValueBuf(int paramId) {
-    return parameters_[paramId]->getBuf(PARAMETER_VALUE);
-  }
-
-  const std::vector<Argument>& getOutArgs() { return outArgs_; }
-
-  void incUpdateCounter(int n = 1) {
-    updateCounter_ += n;
-    parameterUpdated_ = true;
-  }
-
-  void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); }
-
-  void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); }
-
-  void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); }
-
-  void prefetch();
-
-  /// copy the output gradient from the main GradientMachine.
-  void copyOutputGrad();
-
-  /// Whether the thread has input data.
-  bool hasInputData() { return batchSize_ != 0; }
-
- protected:
-  void mergeCpuGradients();
-
-  void mergeGradSparse(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradSparseRemote(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradDense(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void computeThread();
-  void valueDispatchThread();
-  void copyGradToBufferThread();
-  void gradCollectThread();
-
-  int copyInArgs();
-  void forward();
-  void backward();
-  void backwardCallback(Parameter* para);
-
-  /// call the actuall callback supplied by the caller of
-  /// GradientMachine::backward
-  void doCallback(int pid);
-
- protected:
-  MultiGradientMachine* multiMachine_;
-  ModelConfig config_;
-  /// whether the thread should stop
-  bool stopping_;
-  /// the threads form which to collect gradient
-  int partnerId_;
-  /// from 0 to threads-1
-  int threadId_;
-  int deviceId_;
-  std::unique_ptr<GradientMachine> gradientMachine_;
-  std::vector<ParameterPtr> parameters_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-
-  /// compute thread
-  std::unique_ptr<std::thread> computeThread_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  Semaphore taskReadySem_;
-  Semaphore outArgsReadySem_;
-
-  /// copy thread
-  std::unique_ptr<std::thread> copyThread_;
-  /// queue of gradient needs to be copied to partner
-  PidQueue gradBufQueue_;
-  hl_stream_t gradStream_;
-
-  /// grad merge thread
-  std::unique_ptr<std::thread> gradCollectThread_;
-  /// queue of gradient needs to be merged with gradient coopied by
-  /// copyGradToBufferThread
-  PidQueue gradQueue_;
-  UpdateCallback backwardCallback_;
-
-  /// value dispatch thread
-  std::unique_ptr<std::thread> valueDispatchThread_;
-  /// queue of the parameter whose the vale are ready for copy
-  PidQueue valueReadyQueue_;
-
-  /// used to notify all the parameter values are ready
-  LockedCondition valueReadyCond_;
-
-  hl_stream_t valueStream_;
-  /// how many parameters are updated
-  std::atomic<int> updateCounter_;
-  bool parameterUpdated_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-  int batchSize_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
deleted file mode 100644
index 1245c441036..00000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "MultiNetwork.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-namespace paddle {
-
-void MultiNetwork::init(const ModelConfig& config,
-                        ParamInitCallback callback,
-                        const std::vector<ParameterType>& parameterTypes,
-                        bool useGpu) {
-  CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1";
-  // check submodel[0] is root
-  CHECK_EQ("root", config.sub_models(0).name())
-      << "sub_models(0) should be root";
-  // ignore root
-  subNetworks_.resize(config.sub_models_size() - 1);
-  // base class
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  // sub networks
-  for (int i = 1; i < config.sub_models_size(); ++i) {
-    std::string subModelName = config.sub_models(i).name();
-    if (FLAGS_parallel_nn) {
-      subNetworks_[i - 1] = std::unique_ptr<ParallelNeuralNetwork>(
-          new ParallelNeuralNetwork(subModelName, this));
-    } else {
-      subNetworks_[i - 1] = std::unique_ptr<NeuralNetwork>(
-          NeuralNetwork::newNeuralNetwork(subModelName, this));
-    }
-    subNetworks_[i - 1]->init(config);
-  }
-}
-
-void MultiNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->prefetch(argumentGroups[i]);
-  }
-}
-
-void MultiNetwork::forward(const std::vector<Argument>& inArgs,
-                           std::vector<Argument>* outArgs,
-                           PassType passType) {
-  // split inArgs to several vectors
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  std::vector<Argument> tempOutArgs;
-  outArgs->clear();
-
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    tempOutArgs.clear();
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType);
-    for (const auto& elem : tempOutArgs) {
-      outArgs->push_back(elem);
-      outArgs->back().dataId = i;
-    }
-  }
-}
-
-void MultiNetwork::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->backward(callback);
-  }
-}
-
-void MultiNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType,
-                                   const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void MultiNetwork::onPassEnd() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->onPassEnd();
-  }
-}
-
-void MultiNetwork::start() {
-  for (auto& subNetwork : subNetworks_) {
-    subNetwork->start();
-  }
-}
-
-void MultiNetwork::finish() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->finish();
-  }
-}
-
-class MultiCombinedEvaluator : public Evaluator {
- public:
-  MultiCombinedEvaluator() {}
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  virtual void start() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  virtual void finish() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    const MultiNetwork& multiNetwork = dynamic_cast<const MultiNetwork&>(nn);
-    CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size());
-    int size = evaluators_.size();
-    for (int i = 0; i < size; i++) {
-      // one evaluator for one subNetwork
-      evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
- protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-};
-
-Evaluator* MultiNetwork::makeEvaluator() const {
-  MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator();
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    std::unique_ptr<Evaluator> evaluator(subNetworks_[i]->makeEvaluator());
-    multiCombinedEvaluator->addEvaluator(std::move(evaluator));
-  }
-  return multiCombinedEvaluator;
-}
-
-void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
deleted file mode 100644
index afe15cb020e..00000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-class MultiNetwork : public NeuralNetwork {
- public:
-  explicit MultiNetwork(std::string subModelName = "")
-      : NeuralNetwork(subModelName) {}
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<std::unique_ptr<NeuralNetwork>>& getSubNetworks() const {
-    return subNetworks_;
-  }
-
-  virtual void start();
-
-  virtual void finish();
-
- protected:
-  std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
deleted file mode 100644
index 0f8048152ff..00000000000
--- a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-#endif
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "MultiNetwork.h"
-#include "RecurrentGradientMachine.h"
-#include "paddle/legacy/gserver/layers/AgentLayer.h"
-#endif
-
-namespace paddle {
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams) {
-  // Create parameters values.
-  if (!para->useGpu() && sharedParams) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE),
-                           (*sharedParams)[paramId]->getMat(PARAMETER_VALUE));
-  } else {
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-    } else {
-      para->enableType(PARAMETER_VALUE);
-    }
-  }
-  // Create parameter gradients.
-  if (para->isSparseRemoteUpdate() && !sharedParams) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-  } else if (para->isGradSparseUpdate()) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW);
-  } else if (!para->isStatic()) {
-    para->enableType(PARAMETER_GRADIENT);
-  }
-}
-
-NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (config.type() == "recurrent_nn") {
-    return newNeuralNetwork("root");
-  } else if (config.type() == "multi_nn") {
-    return new MultiNetwork("root");
-  } else {
-    return newNeuralNetwork();
-  }
-#else
-  return new NeuralNetwork();
-#endif
-}
-
-std::map<std::string, bool> NeuralNetwork::dllInitMap;
-
-void NeuralNetwork::init(const ModelConfig& config,
-                         ParamInitCallback callback,
-                         const std::vector<ParameterType>& parameterTypes,
-                         bool useGpu) {
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-  ParamInitCallback paramCallback = nullptr;
-  if (callback != nullptr) {
-    paramSelfInited_ = false;
-    paramCallback = callback;
-  } else {
-    paramSelfInited_ = true;
-    paramCallback = std::bind(parameterInitNN, _1, _2, nullptr);
-  }
-  config_ = config;
-
-  if (rootNetwork_ != nullptr) {
-    // direct use parameters_ and parameterMap_ from base network
-    CHECK_EQ((size_t)config.parameters_size(),
-             rootNetwork_->getParameters().size());
-    parameters_ = rootNetwork_->getParameters();
-    parameterMap_ = *(rootNetwork_->getParameterMap());
-  } else {
-    parameters_.reserve(config.parameters_size());
-    for (const auto& para_config : config.parameters()) {
-      auto parameter = std::make_shared<Parameter>(para_config,
-                                                   useGpu,
-                                                   /*initialize=*/false);
-      paramCallback(parameters_.size(), parameter.get());
-      if (!callback) {
-        for (ParameterType type :
-             (parameter->isStatic()
-                  ? std::vector<ParameterType>{PARAMETER_VALUE}
-                  : parameterTypes)) {
-          if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) {
-            parameter->enableType(type);
-          }
-        }
-      }
-      parameter->setID(parameters_.size());
-      parameters_.push_back(parameter);
-      CHECK(!parameterMap_.count(parameter->getName()));
-      parameterMap_[parameter->getName()] = parameter;
-    }
-  }
-
-  auto layerCreate = [&](const LayerConfig& layer_config) {
-    auto layer = Layer::create(layer_config);
-    CHECK(layer) << "Create layer failed. Layer name:" << layer->getName();
-    layers_.push_back(layer);
-    CHECK(!layerMap_.count(layer->getName()));
-    layerMap_[layer->getName()] = layer;
-  };
-
-  auto subModelConfig = std::find_if(config.sub_models().begin(),
-                                     config.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    layers_.reserve(subModelConfig->layer_names_size());
-    for (const auto& layer_name : subModelConfig->layer_names()) {
-      auto layer_config =
-          std::find_if(config.layers().begin(),
-                       config.layers().end(),
-                       [=](const LayerConfig& layer_config) {
-                         return layer_config.name() == layer_name;
-                       });
-      CHECK(layer_config != config.layers().end());
-      layerCreate(*layer_config);
-    }
-  } else {
-    layers_.reserve(config.layers_size());
-    for (const auto& layer_config : config.layers()) {
-      bool useLayer = true;
-      if (config.has_external_config()) {
-        useLayer = true;
-        for (const auto& name : config.external_config().layer_names()) {
-          if (layer_config.name() == name) {
-            useLayer = false;
-            break;
-          }
-        }
-      }
-      if (useLayer) {
-        layerCreate(layer_config);
-      }
-    }
-  }
-
-  for (const auto& layer : layers_) {
-    layer->init(layerMap_, parameterMap_);
-    layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu);
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->input_layer_names()
-                    : config.input_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    dataLayers_.push_back(std::dynamic_pointer_cast<DataLayer>(it->second));
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->output_layer_names()
-                    : config.output_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    outputLayers_.push_back(it->second);
-  }
-
-  for (const auto& layer : layers_) {
-    const auto& name = layer->getName();
-    bool isMiddleLayer = true;
-
-    // if data layer
-    for (const auto& dataLayer : dataLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    // if output layer
-    for (const auto& dataLayer : outputLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    if (isMiddleLayer) {
-      middleLayers_.push_back(layer);
-    }
-  }
-}
-
-void NeuralNetwork::connect(LayerPtr agentLayer,
-                            LayerPtr realLayer,
-                            int height) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
-  CHECK_NOTNULL(agent);
-  agent->setRealLayer(realLayer, height);
-#endif
-}
-
-void NeuralNetwork::connect(std::string agentLayerName,
-                            NeuralNetwork* srcNN,
-                            std::string realLayerName) {
-  connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName));
-}
-
-void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        para->clearGradient();
-        if (mat) mat->clearIndices();
-      }
-    }
-  }
-
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    if (FLAGS_parallel_nn) {
-      const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    }
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    layer->prefetch();
-  }
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        mat->setupIndices();
-        auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-            para->getMat(PARAMETER_GRADIENT).get());
-        matGrad->reserveStore();
-      }
-    }
-  }
-}
-
-void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                            std::vector<Argument>* outArgs,
-                            PassType passType) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  gLayerStackTrace.set_stage(true);
-
-  {
-    for (auto& layer : layers_) {
-      REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
-      gLayerStackTrace.push(layer->getName());
-      layer->forward(passType);
-      gLayerStackTrace.pop(layer->getName());
-    }
-  }
-
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void NeuralNetwork::resetState() {
-  for (auto& layer : layers_) {
-    layer->resetState();
-  }
-}
-
-void NeuralNetwork::setState(const MachineState& machineState) {
-  for (size_t i = 0; i < layers_.size(); i++) {
-    if (machineState[i] != nullptr) {
-      layers_[i]->setState(machineState[i]);
-    }
-  }
-}
-
-void NeuralNetwork::getState(MachineState& machineState) {
-  machineState.clear();
-  machineState.reserve(layers_.size());
-  for (auto& layer : layers_) {
-    LayerStatePtr p = layer->getState();
-    machineState.push_back(p);
-  }
-}
-
-void NeuralNetwork::backward(const UpdateCallback& callback) {
-  gLayerStackTrace.set_stage(false);
-  FOR_EACH_R(layer, layers_) {
-    REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
-    gLayerStackTrace.push((*layer)->getName());
-    if ((*layer)->needGradient()) {
-      (*layer)->backward(callback);
-    }
-    gLayerStackTrace.pop((*layer)->getName());
-  }
-}
-
-void NeuralNetwork::finish() {
-#ifdef PADDLE_WITH_MKLDNN
-  FOR_EACH_R(layer, layers_) {
-    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
-    if (dnnLayer) {
-      dnnLayer->convertWeightsToPaddle();
-    }
-  }
-#endif
-}
-
-Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  return getLayer(layerName)->getOutput();
-}
-
-void NeuralNetwork::onPassEnd() {
-  for (auto& layer : layers_) {
-    layer->onPassEnd();
-  }
-}
-
-void NeuralNetwork::releaseOutput() {
-  for (auto& layer : middleLayers_) {
-    Argument& arg = layer->getOutput();
-    arg.value.reset();
-  }
-}
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-class CombinedEvaluator : public Evaluator {
- public:
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  void start() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  void finish() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  void eval(const NeuralNetwork& nn) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->eval(nn);
-    }
-  }
-  real evalImp(std::vector<Argument>& arguments) override {
-    (void)arguments;
-    return -1;
-  }
-  void printStats(std::ostream& os) const override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  void distributeEval(ParameterClient2* client) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
- protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-
-  // Evaluator interface
- public:
-  /**
-   * @brief getNames will return all inside evaluators' names.
-   * @param names [out]: return names.
-   */
-  void getNames(std::vector<std::string>* names) override {
-    for (auto& eval : evaluators_) {
-      eval->getNames(names);
-    }
-  }
-
-  /**
-   * @brief getValue could get all inside evaluators' value.
-   */
-  real getValue(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<real>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getValue(name, err);
-        });
-  }
-
-  /**
-   * @brief getType could get all inside evaluators' type.
-   */
-  std::string getType(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<std::string>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getType(name, err);
-        });
-  }
-
- private:
-  template <typename T>
-  T getMethodHelper(const std::string& name,
-                    Error* err,
-                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
-                        callback) const {
-    for (auto& eval : evaluators_) {
-      std::vector<std::string> names;
-      eval->getNames(&names);
-      if (std::find(names.begin(), names.end(), name) != names.end()) {
-        return callback(eval);
-      }
-    }
-    *err = Error("No such key %s", name.c_str());
-    return T();
-  }
-};
-
-class SubnetEvaluator : public CombinedEvaluator {
- public:
-  SubnetEvaluator(const std::string& layerName,
-                  std::unique_ptr<Evaluator>&& evaluator)
-      : layerName_(layerName) {
-    addEvaluator(std::move(evaluator));
-  }
-  void eval(const NeuralNetwork& nn) override {
-    const LayerPtr& layer = nn.getLayer(layerName_);
-    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
-                 << nn.getName();
-    bool accessed = false;
-    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
-      subnet.eval(evaluators_[0].get());
-      accessed = true;
-    });
-    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
-                    << " in submodel " << nn.getName();
-  }
-
- protected:
-  std::string layerName_;
-};
-
-Evaluator* NeuralNetwork::makeEvaluator() const {
-  CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
-  auto subModelConfig = std::find_if(config_.sub_models().begin(),
-                                     config_.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config_.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    // create the evaluators that belong to CURRENT submodel
-    for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) {
-      // find evaluator by name
-      auto thisEvalConfig = std::find_if(
-          config_.evaluators().begin(),
-          config_.evaluators().end(),
-          [=](const EvaluatorConfig& ecfg) {
-            return ecfg.name() == subModelConfig->evaluator_names(i);
-          });
-      bool validConfig = (thisEvalConfig != config_.evaluators().end());
-      if (validConfig) {
-        std::unique_ptr<Evaluator> evaluator(
-            Evaluator::create(*thisEvalConfig));
-        combinedEvaluator->addEvaluator(std::move(evaluator));
-      }
-    }
-    for (auto& layer : layers_) {
-      layer->accessSubNetwork(
-          [layer, combinedEvaluator](NeuralNetwork& subnet) {
-            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
-                layer->getName(),
-                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
-            combinedEvaluator->addEvaluator(std::move(subEvaluator));
-          });
-    }
-  } else {
-    for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
-      std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
-      combinedEvaluator->addEvaluator(std::move(evaluator));
-    }
-  }
-  return combinedEvaluator;
-}
-
-void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-#endif
-
-void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_GE(outputLayers_.size(), args.size());
-  for (size_t i = 0; i < args.size(); ++i) {
-    outputLayers_[i]->getOutput().grad = args[i].grad;
-  }
-}
-
-extern NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                             NeuralNetwork* network)
-    __attribute__((weak));
-
-NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name,
-                                               NeuralNetwork* rootNetwork) {
-  if (newCustomNerualNetwork) {
-    return newCustomNerualNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
deleted file mode 100644
index 566157c8998..00000000000
--- a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/gserver/layers/CostLayer.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-/*
- * @brief  Init function for the parameters.
- * @param paramId: the id of the parameter to init.
- * @param para: the pointer to the parameter to init.
- * @param sharedParams: the pointer to an array of the parameter to be shared.
- *                      If it is null, no parameter sharing is used.
- *                      Only CPU paramters can be shared.
- * It handles CPU, CPU sparse, CPU sparse remote,
- * and GPU parameters differently. If the type
- * of a parameter is NORMAL. Basically nothing need to be done.
- * CPU value: NORMAL.
- * CPU param: NORMAL.
- *
- * CPU sparse value: NORMAL.
- * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW.
- *
- * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE).
- * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams)
- *                             MAT_SPARSE_ROW_AUTO_GROW (sharedParams)
- *
- * GPU value: NORMAL
- * GPU param: NORMAL
- */
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams);
-
-class NeuralNetwork : public GradientMachine {
- public:
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType>& parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * Connect two submodels and
-   * down-submodel's output become up-submodel's input.
-   * By default, connection is one by one,
-   * If the agent height is smaller than real layer, *height* has to be filled.
-   *
-   * @param realLayer  The down-submodel's output layer.
-   * @param agentLayer The up-submodel's input agent layer.
-   */
-  static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
-  void connect(std::string agentLayerName,
-               NeuralNetwork* srcNN,
-               std::string realLayerName);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  const LayerPtr& getLayer(const std::string& layerName) const {
-    auto it = layerMap_.find(layerName);
-    CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
-    return it->second;
-  }
-
-  virtual void onPassEnd();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-#endif
-
-  virtual void resetState();
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
-  /// set machine state
-  virtual void setState(const MachineState& machineState);
-
-  /// get machine state
-  virtual void getState(MachineState& machineState);
-
-  static NeuralNetwork* create(const ModelConfig& config);
-
-  ParameterMap* getParameterMap() { return &parameterMap_; }
-
-  /**
-   * @brief Access each layer as a for each loop.
-   * @param callback invoke with each layer.
-   */
-  template <typename T>
-  void forEachLayer(T callback) {
-    for (auto& l : layers_) {
-      if (callback(l)) {
-        break;
-      }
-    }
-  }
-
-  static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
-                                         NeuralNetwork* rootNetwork = nullptr);
-
-  const std::string& getName() const { return subModelName_; }
-
-  /// some finish work, like convert the weight format of MKLDNNLayers
-  void finish();
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  void releaseOutput();
-
- protected:
-  /**
-   * The constructor of NeuralNetwork.
-   * The sub networks can get parameters_ and parameterMap_
-   * from base NeuralNetwork.
-   *
-   * @param subModelName The name of sub-model.
-   * @param rootNetwork  It used in MultiNetwork.
-   */
-  NeuralNetwork(std::string subModelName = "",
-                NeuralNetwork* rootNetwork = nullptr)
-      : subModelName_(subModelName), rootNetwork_(rootNetwork) {}
-
-  std::string subModelName_;
-  ModelConfig config_;
-  std::vector<LayerPtr> layers_;
-  ParameterMap parameterMap_;
-  LayerMap layerMap_;
-
-  std::vector<DataLayerPtr> dataLayers_;
-  std::vector<LayerPtr> outputLayers_;
-  std::vector<LayerPtr> middleLayers_;
-
-  static std::map<std::string, bool> dllInitMap;
-
-  NeuralNetwork* rootNetwork_;
-
-  /// Whether parameter of this NN is initialized by its own
-  /// (i.e., not by callback supplied with the caller)
-  bool paramSelfInited_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
deleted file mode 100644
index 33d24b5b832..00000000000
--- a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "ParallelNeuralNetwork.h"
-
-#include <pthread.h>
-#include <sched.h>
-
-namespace paddle {
-
-void ParallelNeuralNetwork::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-
-  if (config.type() == "recurrent_nn") {
-    LOG(FATAL)
-        << "You can not add `--parallel_nn=true` on the command line, "
-        << "parallel_nn training mode does not support the recurrent_nn model.";
-  }
-
-  useGpu_ = useGpu;
-  numDevices_ = 0;
-  if (useGpu_) {
-    numDevices_ = hl_get_device_count();
-  }
-
-  for (auto& layer : layers_) {
-    int deviceId = layer->getDeviceId();
-    CHECK_LT(deviceId, numDevices_);
-    addComputeThread(deviceId);
-  }
-}
-
-void ParallelNeuralNetwork::addComputeThread(int deviceId) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      return;
-    }
-  }
-
-  threads_.emplace_back(new ParallelThread(
-      threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false));
-}
-
-void ParallelNeuralNetwork::waitAllThread() {
-  for (auto& thread : threads_) {
-    thread->jobEnqueue(NULL, TASK_END_LAYER);
-  }
-
-  for (size_t i = 0; i < threads_.size(); i++) {
-    threads_[i]->queue_.waitEmpty();
-  }
-}
-
-void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId,
-                                               LayerPtr layer,
-                                               TaskType task) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      thread->jobEnqueue(layer, task);
-      return;
-    }
-  }
-  LOG(FATAL) << "No specific device thread ";
-}
-
-void ParallelNeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>* outArgs,
-                                    PassType passType) {
-  for (auto& thread : threads_) {
-    thread->setForwardPassType(passType);
-  }
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD);
-  }
-
-  {
-    REGISTER_TIMER("forwardTime");
-    waitAllThread();
-  }
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void ParallelNeuralNetwork::backward(const UpdateCallback& callback) {
-  for (auto& thread : threads_) {
-    thread->setBackwardCallback(callback);
-  }
-
-  FOR_EACH_R(layer, layers_) {
-    dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD);
-  }
-  {
-    REGISTER_TIMER("backwardTime");
-    waitAllThread();
-  }
-}
-
-void ParallelNeuralNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                            std::vector<Argument>* outArgs,
-                                            PassType passType,
-                                            const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void ParallelNeuralNetwork::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu)
-    : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {}
-
-ParallelThread::~ParallelThread() { stop(); }
-
-void ParallelThread::stop() {
-  if (computeThread_) {
-    jobEnqueue(NULL, TASK_THREAD_FINISH);
-    computeThread_->join();
-    computeThread_.reset(nullptr);
-  }
-}
-
-void ParallelThread::computeThread() {
-  LOG(INFO) << "gradComputeThread " << threadId_;
-
-  if (useGpu_) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    struct Job job_work = queue_.dequeue();
-
-    if (job_work.task_ == TASK_END_LAYER) {
-      continue;
-    } else if (job_work.task_ == TASK_THREAD_FINISH) {
-      break;
-    }
-
-    if (TASK_FORWARD == job_work.task_) {
-      {
-        REGISTER_TIMER_INFO("waitInputValue",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitInputValue();
-      }
-      {
-        REGISTER_TIMER_INFO("threadForwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->forward(passType_);
-      }
-      {
-        REGISTER_TIMER_INFO("copyOutputToOtherDevice",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->copyOutputToOtherDevice();
-      }
-    } else {
-      {
-        REGISTER_TIMER_INFO("waitAndMergeOutputGrad",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitAndMergeOutputGrad();
-      }
-      {
-        REGISTER_TIMER_INFO("threadBackwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->backward(backwardCallback_);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      job_work.layer_->markAllInputGrad();
-    }
-  }
-  hl_fini();
-}
-
-void ParallelThread::start() {
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-}
-
-void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) {
-  struct Job job_work;
-  job_work.layer_ = layer;
-  job_work.task_ = task;
-  queue_.enqueue(job_work);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
deleted file mode 100644
index c091459506a..00000000000
--- a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "NeuralNetwork.h"
-
-namespace paddle {
-
-class ParallelThread;
-
-enum TaskType {
-  TASK_FORWARD = 0,
-  TASK_BACKWARD = 1,
-  TASK_END_LAYER = 2,
-  TASK_THREAD_FINISH = 3,
-};
-
-/**
- * A ParallelNeuralNetwork is capable of calculating a neural network through
- * multiple threads in parallel.
- */
-class ParallelNeuralNetwork : public NeuralNetwork {
- public:
-  ParallelNeuralNetwork(std::string subModelName = "",
-                        NeuralNetwork *rootNetwork = nullptr)
-      : NeuralNetwork(subModelName, rootNetwork) {}
-
-  virtual void init(const ModelConfig &config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType> &parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  virtual void forward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback &callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType,
-                       const UpdateCallback &callback = NULL);
-
-  virtual void start();
-
-  void addComputeThread(int deviceId);
-
-  void dispatchByDeviceId(int deviceId, LayerPtr layer, TaskType task);
-
-  void waitAllThread();
-
-  // virtual void eval(Evaluator* evaluator);
-
- protected:
-  bool useGpu_;
-  /// number of gpu devices
-  int numDevices_;
-  std::vector<std::unique_ptr<ParallelThread>> threads_;
-};
-
-class ParallelThread {
- public:
-  ParallelThread(int threadId, int deviceId, bool useGpu);
-  ~ParallelThread();
-  void jobEnqueue(LayerPtr layer, TaskType task);
-  void start();
-  void stop();
-  int getDeviceId() const { return deviceId_; }
-
-  void setBackwardCallback(const UpdateCallback &callback) {
-    backwardCallback_ = callback;
-  }
-  void setForwardPassType(PassType passType) { passType_ = passType; }
-
- protected:
-  void computeThread();
-
- public:
-  struct Job {
-    LayerPtr layer_;
-    TaskType task_;
-  };
-  typedef Queue<Job> JobQueue;
-  JobQueue queue_;
-
- protected:
-  /// from 0 to threads-1
-  int threadId_;
-  /// the GPU device Id which the computeThread_ used
-  int deviceId_;
-  bool useGpu_;
-  std::unique_ptr<std::thread> computeThread_;
-  /// whether the thread should stop
-  bool stopping_;
-  UpdateCallback backwardCallback_;
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
deleted file mode 100644
index e49f042404f..00000000000
--- a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,1501 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RecurrentGradientMachine.h"
-#include <dlfcn.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include "NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/AgentLayer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
-
-static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
-static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
-static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob";
-
-namespace paddle {
-
-/**
- * Start Custom Calculate Probability callback type.
- *
- * @param nNode, nodes: the path will be explored. nNodes is array size.
- *                      nodes is array elements.
- *
- * @return: A custom handler id that will passed to another callback.
- */
-typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes);
-
-/**
- * Doing Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- * @param nNode, nodes: Array. The current path.
- * @param curProb: The current log probability that neural network returns.
- *
- * @return: Log probability which user calculated, it will be updated to this
- *          path.
- * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!!
- */
-typedef real (*DiyCalcProbCallback)(
-    int handler, size_t nNodes, int* nodes, real curProb, bool atEos);
-
-/**
- * Finish Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- */
-typedef void (*DiyStopCalcProbCallback)(int handler);
-
-static DiyCalcProbCallback gDiyProbMethod = nullptr;
-static DiyStartCalcProbCallback gDiyProbStart = nullptr;
-static DiyStopCalcProbCallback gDiyProbStop = nullptr;
-static void* gDiyProbHandle = nullptr;
-
-static void exit_diy_prob() { dlclose(gDiyProbHandle); }
-
-template <typename SymbolType>
-static inline SymbolType loadDiySymbol(const char* symbolName) {
-  void* sym = dlsym(gDiyProbHandle, symbolName);
-  CHECK(sym) << "Cannot load symbol " << symbolName << " from "
-             << FLAGS_diy_beam_search_prob_so;
-  return reinterpret_cast<SymbolType>(sym);
-}
-
-static InitFunction __init__diy_prob_method(
-    [] {
-      std::string soName = FLAGS_diy_beam_search_prob_so;
-      if (!soName.empty()) {
-        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-        atexit(exit_diy_prob);
-        gDiyProbMethod =
-            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
-            DIY_START_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
-            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-      }
-    },
-    std::numeric_limits<int>::max());
-
-class BeamSearchControlCallbacks {
- public:
-  RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
-      beamSearchCandidateAdjust;
-  RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
-  RecurrentGradientMachine::DropCallback stopDetermineCandidates;
-
-  //! for gcc46 aggregate initialization is not very well, so we need to
-  //! explicit
-  BeamSearchControlCallbacks(
-      const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback&
-          candidateAdjust,
-      const RecurrentGradientMachine::NormOrDropNodeCallback& norm,
-      const RecurrentGradientMachine::DropCallback& stop)
-      : beamSearchCandidateAdjust(candidateAdjust),
-        normOrDropNode(norm),
-        stopDetermineCandidates(stop) {}
-};
-
-class BeamSearchStatisticsCallbacks {
- public:
-  RecurrentGradientMachine::EachStepCallback onEachStepStarted;
-  RecurrentGradientMachine::EachStepCallback onEachStepStoped;
-
-  BeamSearchStatisticsCallbacks(
-      const RecurrentGradientMachine::EachStepCallback& start,
-      const RecurrentGradientMachine::EachStepCallback& stop)
-      : onEachStepStarted(start), onEachStepStoped(stop) {}
-};
-
-RecurrentGradientMachine::RecurrentGradientMachine(
-    const std::string& subModelName, NeuralNetwork* rootNetwork)
-    : NeuralNetwork(subModelName),
-      rootNetwork_(rootNetwork),
-      beamSearchCtrlCallbacks_(nullptr),
-      beamSearchStatistics_(nullptr) {
-  CHECK(!subModelName_.empty());
-}
-
-/**
- * bias layer, as input of memory frame 0 will give vector of zeros
- * if bias parameter is not set.
- *
- * boot bias layer create directly in recurrent gradient machine, because:
- *
- * 1. It is only one frame, so it should not be placed in layer group,
- *    which is one instance for every one frame.
- *
- * 2. It is no input layer, so it need resetHeight() before forward(),
- *    and resetHeight() must be called in recurrent gradient machine,
- *    so it's should not be placed in root network.
- */
-class BootBiasLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  IVectorPtr cpuIds_;
-
- public:
-  explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-
-    if (biasParameter_) {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-    }
-    return true;
-  }
-
-  void resetHeight(int height) {
-    if (config_.has_bos_id()) {  // used as a constant id layerConfig
-      IVector::resizeOrCreate(output_.ids, height, useGpu_);
-      output_.ids->reset((int)config_.bos_id());
-    } else {
-      resetOutput(height, getSize());
-    }
-  }
-
-  void forward(PassType passType) override {
-    if (biases_) {
-      MatrixPtr outV = getOutputValue();
-      outV->addBias(*(biases_->getW()), 1);
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (biases_ && biases_->getWGrad()) {
-      backwardActivation();
-      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-      biases_->getParameterPtr()->incUpdate(callback);
-    }
-  }
-};
-
-void RecurrentGradientMachine::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  useGpu_ = useGpu;
-
-  auto subModelConfig =
-      std::find_if(config.sub_models().begin(),
-                   config.sub_models().end(),
-                   [this](const SubModelConfig& sub_model) {
-                     return sub_model.name() == this->subModelName_;
-                   });
-  CHECK(subModelConfig != config.sub_models().end());
-  reversed_ = subModelConfig->reversed();
-  generating_ = subModelConfig->has_generator();
-
-  inFrameLines_.resize(subModelConfig->in_links_size());
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
-    inFrameLines_[i].inLayer =
-        rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
-  }
-
-  outFrameLines_.resize(subModelConfig->out_links_size());
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    auto& linkPair = subModelConfig->out_links(i);
-    outFrameLines_[i].layerName = linkPair.layer_name();
-    outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name());
-  }
-
-  memoryFrameLines_.resize(subModelConfig->memories_size());
-  for (size_t i = 0; i < memoryFrameLines_.size(); ++i) {
-    auto& memoryConfig = subModelConfig->memories(i);
-    memoryFrameLines_[i].layerName = memoryConfig.layer_name();
-    memoryFrameLines_[i].linkName = memoryConfig.link_name();
-    auto agentConfig =
-        std::find_if(config.layers().begin(),
-                     config.layers().end(),
-                     [&memoryConfig](const LayerConfig& layerConfig) {
-                       return layerConfig.name() == memoryConfig.link_name();
-                     });
-    CHECK(agentConfig != config.layers().end());
-    if (memoryConfig.has_boot_layer_name()) {
-      memoryFrameLines_[i].rootLayer =
-          rootNetwork_->getLayer(memoryConfig.boot_layer_name());
-
-      LayerConfig scatterConfig = *agentConfig;
-      memoryFrameLines_[i].rootAgent.reset(
-          new ScatterAgentLayer(scatterConfig));
-      memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
-    } else {
-      LayerConfig biasConfig = *agentConfig;
-      if (memoryConfig.has_boot_bias_parameter_name()) {
-        biasConfig.set_bias_parameter_name(
-            memoryConfig.boot_bias_parameter_name());
-        biasConfig.set_active_type(memoryConfig.boot_bias_active_type());
-      } else if (memoryConfig.has_boot_with_const_id()) {
-        biasConfig.set_bos_id(memoryConfig.boot_with_const_id());
-      }
-      memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig));
-      memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer;
-    }
-
-    if (subModelConfig->has_generator()) {
-      memoryFrameLines_[i].scatterAgents.resize(2);
-      for (auto& agent : memoryFrameLines_[i].scatterAgents) {
-        agent.reset(new ScatterAgentLayer(*agentConfig));
-        agent->init(LayerMap(), parameterMap_);
-      }
-    }
-  }
-
-  if (subModelConfig->has_generator()) {
-    generator_.config = subModelConfig->generator();
-    eosFrameLine_.reset(new EosFrameLine);
-    maxSequenceLength_ = generator_.config.max_num_frames();
-  }
-
-  // get parameters actually used by this Layer Group
-  resizeOrCreateFrames(1);
-  for (auto& para : frames_[0]->getParameters()) {
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-  for (auto& para : parameters_) {  // bias layer parameters
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-}
-
-void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
-  if ((size_t)numFrames <= frames_.size()) {
-    return;
-  }
-
-  frames_.reserve(numFrames);
-  for (auto& inFrameLine : inFrameLines_) {
-    inFrameLine.agents.reserve(numFrames);
-  }
-  for (auto& outFrameLine : outFrameLines_) {
-    outFrameLine.frames.reserve(numFrames);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.frames.reserve(numFrames);
-    memoryFrameLine.agents.reserve(numFrames);
-  }
-  if (eosFrameLine_) {
-    eosFrameLine_->layers.reserve(numFrames);
-  }
-
-  ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           this->parameters_[paramId]->getBuf(PARAMETER_VALUE),
-                           this->parameters_[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT),
-        this->parameters_[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-
-  for (int i = frames_.size(); i < numFrames; ++i) {
-    std::unique_ptr<NeuralNetwork> frame(
-        NeuralNetwork::newNeuralNetwork(subModelName_));
-    frame->init(config_, subParamInitCb);
-
-    for (auto& inFrameLine : inFrameLines_) {
-      inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName));
-    }
-
-    for (auto& outFrameLine : outFrameLines_) {
-      outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName));
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      memoryFrameLine.frames.push_back(
-          frame->getLayer(memoryFrameLine.layerName));
-      memoryFrameLine.agents.push_back(
-          frame->getLayer(memoryFrameLine.linkName));
-    }
-    if (eosFrameLine_) {
-      eosFrameLine_->layers.push_back(
-          frame->getLayer(generator_.config.eos_layer_name()));
-    }
-
-    frames_.emplace_back(std::move(frame));
-  }
-}
-
-void RecurrentGradientMachine::resizeBootFrame(int numSequences) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.biasLayer) {
-      auto biasLayer =
-          dynamic_cast<BootBiasLayer*>(memoryFrameLine.biasLayer.get());
-      CHECK_NOTNULL(biasLayer);
-      biasLayer->resetHeight(numSequences);
-    } else {  // check input root layer height
-      CHECK_EQ(numSequences,
-               memoryFrameLine.rootLayer->getOutput().getNumSequences());
-    }
-  }
-}
-
-void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::checkInputConsistency(
-    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
-  if (commonSeqInfo_.empty()) {
-    commonSeqInfo_.resize(seqInfo.size());
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
-      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
-    }
-  } else {
-    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
-        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-        << " has mismatched number of sequences";
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-    }
-  }
-}
-
-void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
-  int numSequences = commonSeqInfo_.size();
-  numSeqs_.resize(maxSequenceLength_);
-  for (int i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
-      numSeqs_[j] = i + 1;
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeInput(PassType passType) {
-  info_.clear();
-  info_.resize(inFrameLines_.size());
-
-  commonSeqInfo_.clear();
-  seqInfos_.clear();
-  seqInfos_.resize(inFrameLines_.size());
-
-  for (size_t i = 0; i < inFrameLines_.size(); i++) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      continue;
-    }
-    input.getSeqInfo(&seqInfos_[i]);
-    checkInputConsistency(i, seqInfos_[i]);
-  }
-  CHECK(!commonSeqInfo_.empty())
-      << "At least one input needs to be sequence or subsequence";
-  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
-
-  calcNumSequencesAtEachStep();
-
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      seqInfos_[i] = commonSeqInfo_;
-    }
-    createInFrameInfo(i, input, passType);
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    // inFrameLine select rows in real layer one time
-    for (size_t i = 0; i < inFrameLines_.size(); i++) {
-      selectRowsOneTime(inFrameLines_[i].inLayer,
-                        info_[i].allIds,
-                        &(inFrameLines_[i].outArg),
-                        passType);
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
-  calcSequenceStartPositions();
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    Info info;
-    auto& outFrameLine = outFrameLines_[i];
-    ICpuGpuVectorPtr sequenceStartPositions;
-    ICpuGpuVectorPtr subSequenceStartPositions;
-    createOutFrameInfo(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
-                                       subSequenceStartPositions,
-                                       info.allIds,
-                                       info.idIndex);
-  }
-}
-
-void RecurrentGradientMachine::connectFrames(PassType passType) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      createMemoryFrameInfo(&memoryFrameLine, passType);
-      scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer,
-                                          memoryFrameLine.outArg,
-                                          memoryFrameLine.allIds,
-                                          /* idIndex */ 0,
-                                          memoryFrameLine.allIds->getSize(),
-                                          /* handleBackward */ true);
-      if (memoryFrameLine.sequenceStartPositions) {
-        int size = memoryFrameLine.sequenceStartPositions->getSize();
-        scatterAgent->setSequenceStartPositions(
-            memoryFrameLine.sequenceStartPositions,
-            /* seqStartPosIndex */ 0,
-            size);
-      }
-    }
-  }
-
-  for (auto& outFrameLine : outFrameLines_) {
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    gatherAgent->clearRealLayers();
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    // connect in_links
-    for (size_t j = 0; j < inFrameLines_.size(); ++j) {
-      Info& info = info_[j];
-      // idSize denotes the sum number of tokens in each length i
-      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
-      int idSize = info.idIndex.empty() ? numSeqs_[i]
-                                        : info.idIndex[i + 1] - info.idIndex[i];
-      InFrameLine inFrameLine = inFrameLines_[j];
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
-      scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
-                                          inFrameLine.outArg,
-                                          info.allIds,
-                                          idIndex,
-                                          idSize,
-                                          i == 0);
-      if (info.sequenceStartPositions) {
-        // size: the length of subsequence
-        int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
-        scatterAgent->setSequenceStartPositions(
-            info.sequenceStartPositions, info.seqStartPosIndex[i], size);
-      }
-    }
-
-    // connect out_links
-    for (auto& outFrameLine : outFrameLines_) {
-      auto gatherAgent =
-          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-      gatherAgent->addRealLayer(outFrameLine.frames[i]);
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      NeuralNetwork::connect(
-          memoryFrameLine.agents[i],
-          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          numSeqs_[i] /*height of agent*/);
-    }
-  }
-}
-
-void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                       std::vector<Argument>* outArgs,
-                                       PassType passType) {
-  /* inArgs and outArgs are not used.
-     The inputs are inFrameLines_[i].inLayer.
-     The outputs are outFramesLines_[i].agentLayer
-   */
-
-  if (generating_) {
-    generateSequence();
-    return;
-  }  // else forward..
-
-  reorganizeInput(passType);
-  int numSequences = commonSeqInfo_.size();
-
-  resizeOrCreateFrames(maxSequenceLength_);
-  resizeBootFrame(numSequences);
-
-  connectFrames(passType);
-
-  REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
-  // forward
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(passType);
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[i]->forward(inArgs, &outArgs, passType);
-  }
-
-  reorganizeOutput(passType);
-}
-
-void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
-  if (generating_) {
-    return;
-  }
-  REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
-  AsyncGpuBlock asyncGpuBlock;
-  for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
-    frames_[i]->backward(nullptr);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->backward(nullptr);
-  }
-}
-
-void RecurrentGradientMachine::forwardBackward(
-    const std::vector<Argument>& inArgs,
-    std::vector<Argument>* outArgs,
-    PassType passType,
-    const UpdateCallback& callback) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
-  // call printers frame by frame
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
-    evaluator->eval(*(frames_[i].get()));
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchControlCallbacks(
-    const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-    const NormOrDropNodeCallback& normOrDropNode,
-    const DropCallback& stopBeamSearch) {
-  this->removeBeamSearchControlCallbacks();
-  //! for gcc 46, aggregate initialization is not supported. TAT
-  this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks(
-      adjustBeamSearch, normOrDropNode, stopBeamSearch);
-}
-
-void RecurrentGradientMachine::removeBeamSearchControlCallbacks() {
-  if (this->beamSearchCtrlCallbacks_) {
-    delete this->beamSearchCtrlCallbacks_;
-    this->beamSearchCtrlCallbacks_ = nullptr;
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks(
-    const EachStepCallback& onEachStepStarted,
-    const EachStepCallback& onEachStepStoped) {
-  this->removeBeamSearchStatisticsCallbacks();
-  this->beamSearchStatistics_ =
-      new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped);
-}
-
-void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
-  if (this->beamSearchStatistics_) {
-    delete this->beamSearchStatistics_;
-    this->beamSearchStatistics_ = nullptr;
-  }
-}
-
-namespace {
-void lenToStarts(std::vector<int>& starts) {
-  int pos = 0;
-  starts.back() = 0;
-  for (auto& start : starts) {
-    int tmp = start;
-    start = pos;
-    pos += tmp;
-  }
-  starts.back() = pos;
-}
-}  // namespace
-
-void RecurrentGradientMachine::calcSequenceStartPositions() {
-  std::vector<int> starts(commonSeqInfo_.size() + 1);
-  for (auto& seqInfo : commonSeqInfo_) {
-    starts[seqInfo.seqId] = seqInfo.topLevelLength;
-  }
-  lenToStarts(starts);
-  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
-  std::copy(starts.begin(),
-            starts.end(),
-            sequenceStartPositions_->getMutableData(false));
-}
-
-void RecurrentGradientMachine::checkOutputConsistency(
-    OutFrameLine& outFrameLine) {
-  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
-    int numSequences = frame->getOutput().getNumSequences();
-    CHECK_EQ(numSeqs_[i], numSequences);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  checkOutputConsistency(outFrameLine);
-
-  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
-    createOutFrameInfo_seq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  } else {
-    createOutFrameInfo_subseq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_seq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqStart = starts[commonSeqInfo_[j].seqId];
-      int seqLength = commonSeqInfo_[j].topLevelLength;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-  sequenceStartPositions = sequenceStartPositions_;
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_subseq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  size_t numSequences = commonSeqInfo_.size();
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-  std::vector<int> subStarts(starts[numSequences] + 1);
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    const int* seqStarts =
-        frame->getOutput().sequenceStartPositions->getData(false);
-    for (size_t j = 0; j < numSequences; ++j) {
-      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
-          seqStarts[j + 1] - seqStarts[j];
-    }
-  }
-  lenToStarts(subStarts);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int pos = starts[commonSeqInfo_[j].seqId] + i;
-      int subSeqStart = subStarts[pos];
-      int subSeqEnd = subStarts[pos + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-
-  ICpuGpuVector::resizeOrCreate(
-      subSequenceStartPositions, subStarts.size(), false);
-  int* cpuSubSequenceStartPositions =
-      subSequenceStartPositions->getMutableData(false);
-  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* cpuSequenceStartPositions =
-      sequenceStartPositions->getMutableData(false);
-  for (size_t i = 0; i <= numSequences; ++i) {
-    cpuSequenceStartPositions[i] = subStarts[starts[i]];
-  }
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* create scattered id infomation for all realLayer of inFrameLines one time.
- * If hasSubseq, will also create scattered sequenceStartPositions infomation
- * for all realLayer of inFrameLines one time.
- */
-void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
-                                                 const Argument& input,
-                                                 PassType passType) {
-  if (!input.hasSeq()) {
-    createInFrameInfo_nonseq(inlinkId, input, passType);
-  } else if (!input.hasSubseq()) {
-    createInFrameInfo_seq(inlinkId, input, passType);
-  } else {
-    createInFrameInfo_subseq(inlinkId, input, passType);
-  }
-}
-
-void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.clear();
-  for (size_t i = 0; i < seqInfo.size(); ++i) {
-    allIds.push_back(seqInfo[i].seqId);
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-}
-
-void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
-                                                     const Argument& input,
-                                                     PassType passType) {
-  std::vector<int> allIds;
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      int seqStart = seqInfo[j].seqStart;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-  }
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-  std::vector<int> sequenceStartPositions;
-  const int* subSequenceStartPositions = nullptr;
-
-  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-  inlinkInfo->seqStartPosIndex.clear();
-  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    sequenceStartPositions.push_back(0);  // first element = 0
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       subSeqEnd - subSeqStart);
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-  }
-  // inFrameLine create sequenceStartPositions one time
-  CHECK_EQ(
-      sequenceStartPositions.size(),
-      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/
-void RecurrentGradientMachine::createMemoryFrameInfo(
-    MemoryFrameLine* memoryFrameLine, PassType passType) {
-  const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
-  size_t numSequences = input.getNumSequences();
-  std::vector<int> allIds;
-  bool seqFlag = input.hasSeq();
-  CHECK(!input.hasSubseq())
-      << "Subsequence boot layer for memory is not supported";
-
-  if (seqFlag) {  // for sequenceScatterAgentLayer
-    std::vector<int> sequenceStartPositions;
-    sequenceStartPositions.push_back(0);  // first element = 0
-    const int* starts = input.sequenceStartPositions->getData(false);
-    for (size_t i = 0; i < numSequences; ++i) {
-      // memory info adopt info of inlinks[0]
-      int seqId = seqInfos_[0][i].seqId;
-      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       starts[seqId + 1] - starts[seqId]);
-    }
-    createSeqPos(sequenceStartPositions,
-                 &(*memoryFrameLine).sequenceStartPositions);
-
-  } else {  // for scatterAgentLayer
-    for (size_t i = 0; i < numSequences; ++i) {
-      allIds.push_back(seqInfos_[0][i].seqId);
-    }
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
-  // memoryFrameLine select rows in real layer one time
-  selectRowsOneTime((*memoryFrameLine).rootLayer,
-                    (*memoryFrameLine).allIds,
-                    &(*memoryFrameLine).outArg,
-                    passType);
-}
-
-void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds,
-                                             IVectorPtr* dstIds,
-                                             int size) {
-  int idSize = srcIds.size();
-  CHECK_EQ(idSize, size);
-  IVector::resizeOrCreate(*dstIds, idSize, useGpu_);
-  (*dstIds)->copyFrom(srcIds.data(), idSize);
-  // check
-  std::sort(srcIds.begin(), srcIds.end());
-  for (int i = 0; i < idSize; ++i) {
-    CHECK_EQ(srcIds[i], i);
-  }
-}
-
-void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
-                                                 const IVectorPtr& allIds,
-                                                 Argument* arg,
-                                                 PassType passType) {
-  Argument& src = layer->getOutput();
-  if (src.value) {
-    const MatrixPtr& realV = src.value;
-    int height = realV->getHeight();
-    int width = realV->getWidth();
-    Matrix::resizeOrCreate(
-        arg->value, height, width, /* trans */ false, useGpu_);
-    arg->value->zeroMem();
-    arg->value->selectRows(*realV, *allIds);
-    if (passType != PASS_TEST) {
-      Matrix::resizeOrCreate(
-          arg->grad, height, width, /* trans */ false, useGpu_);
-      arg->grad->zeroMem();
-    }
-  }
-  if (src.ids) {
-    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
-    arg->ids->selectFrom(*src.ids, *allIds);
-  }
-}
-
-void RecurrentGradientMachine::createSeqPos(
-    const std::vector<int>& sequenceStartPosition,
-    ICpuGpuVectorPtr* sequenceStartPositions) {
-  int size = sequenceStartPosition.size();
-  const int* data = sequenceStartPosition.data();
-  ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false);
-  (*sequenceStartPositions)->copyFrom(data, size, false);
-}
-
-size_t RecurrentGradientMachine::getGenBatchSize() {
-  size_t numSequences = 0;
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (!memoryFrameLine.rootLayer) continue;
-    Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
-    size_t batchSize = bootArg.getNumSequences();
-    if (numSequences) {
-      CHECK_EQ(numSequences, batchSize);
-    } else {
-      numSequences = batchSize;
-    }
-  }
-  CHECK(numSequences)
-      << "Fail to get batch size in generation. "
-         "At least one of the Memory layer MUST have a layer that is NOT in "
-         "the layer group to boot it, and this boot layer is used to "
-         "decide batch_size in generation process.";
-  return numSequences;
-}
-
-void RecurrentGradientMachine::generateSequence() {
-  CHECK_NOTNULL(eosFrameLine_.get());
-  CHECK_GE(outFrameLines_.size(), 1UL);
-  size_t numSequences = getGenBatchSize();
-
-  resizeBootFrame(numSequences);
-  // We create only two sub-network in generation, one stores states of all
-  // layers in previous time step and the other storing the states at current
-  // time step.
-  resizeOrCreateFrames(2);
-
-  // outFrameLines_.size() > 1UL
-  dataArgsSize_ = outFrameLines_.size() - 1;
-  dataArgs_.resize(dataArgsSize_);
-  dataArgsFrame_.clear();
-  dataArgsFrame_.resize(dataArgsSize_);
-
-  // connect boot frame memory links
-  std::vector<int> ids(numSequences);
-  for (size_t i = 0; i < numSequences; ++i) {
-    ids[i] = i;
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
-    }
-    NeuralNetwork::connect(
-        memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
-  }
-
-  // boot layer forward
-  AsyncGpuBlock asyncGpuBlock;
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(PASS_TEST);
-  }
-
-  // init outArg
-  size_t resultNum = generator_.config.num_results_per_sample();
-  size_t maxGenWordCount =
-      generator_.config.max_num_frames() * numSequences * resultNum;
-  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
-  if (resultNum > 1) {
-    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
-    Matrix::resizeOrCreate(generator_.outArg.in,
-                           /* height */ numSequences,
-                           /* width */ resultNum,
-                           false,
-                           /* useGpu */ false);
-  }
-  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
-                                numSequences + 1,
-                                /* useGpu */ false);
-  if (getBeamSize() > 1) {
-    beamSearch(numSequences);
-  } else {
-    oneWaySearch(numSequences);
-  }
-  if (dataArgsSize_) createDataOutlink();
-
-  size_t size = generator_.ids.size();
-  generator_.outArg.ids->resize(size);
-  generator_.outArg.ids->copyFrom(generator_.ids.data(), size);
-
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-  auto dataAgent = dynamic_cast<DataLayer*>(outFrameLine.agentLayer.get());
-  CHECK_NOTNULL(dataAgent);
-  dataAgent->setData(generator_.outArg);
-  dataAgent->prefetch();
-}
-
-void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-
-  // finalPaths_[0] stores the generated results of the
-  // entire batch, so its size exactly equals to batchSize.
-  finalPaths_.clear();
-  finalPaths_.resize(1);
-  std::vector<Path>& finalPaths = finalPaths_[0];
-  finalPaths.resize(batchSize);
-
-  seqIds_.resize(batchSize);
-  std::vector<int> scatterIds;
-  for (size_t i = 0; i < batchSize; ++i) {
-    finalPaths[i].seqId = i;
-    seqIds_[i] = i;
-  }
-
-  // forward
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (i && scatterIds.empty()) break;
-    int machineCur = i % 2;
-    int machinePrev = (i - 1) % 2;
-    // connect memory links
-    if (i) {
-      seqIds_.clear();
-      for (size_t j = 0; j < batchSize; ++j) {
-        if (finalPaths[j].seqId != -1) seqIds_.push_back(j);
-      }
-
-      for (auto& memoryFrameLine : memoryFrameLines_) {
-        auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-            memoryFrameLine.scatterAgents[machineCur].get());
-        scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds);
-        scatterAgent->forward(PASS_TEST);
-        NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                               memoryFrameLine.scatterAgents[machineCur]);
-      }
-    }
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-    const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids;
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j));
-      finalPaths[seqIds_[j]].machineIdVec.push_back(j);
-    }
-
-    copyDataOutlinkFrame(machineCur);
-
-    // check eos
-    const IVectorPtr& eosVec =
-        eosFrameLine_->layers[machineCur]->getOutput().ids;
-    scatterIds.clear();
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      if (eosVec->getElement(j) == 1U) {
-        // path.seqId = -1 indicates end of generation
-        // of an input sequence
-        finalPaths[seqIds_[j]].seqId = -1;
-      } else {
-        scatterIds.push_back(j);
-      }
-    }
-  }
-
-  batchMachineIdVec_.clear();
-  batchMachineStartPos_.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  generator_.ids.clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    generator_.ids.insert(generator_.ids.end(),
-                          finalPaths[i].ids.begin(),
-                          finalPaths[i].ids.end());
-    starts[i + 1] = generator_.ids.size();
-    batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                              finalPaths[i].machineIdVec.begin(),
-                              finalPaths[i].machineIdVec.end());
-  }
-}
-
-void RecurrentGradientMachine::connectPrevFrame(int stepId,
-                                                std::vector<Path>& paths) {
-  int machineCur = stepId % 2;
-  int machinePrev = (stepId - 1) % 2;
-  int beam = getBeamSize();
-  machineIds_.clear();
-  topIds_.clear();
-  seqIds_.clear();
-
-  for (size_t j = 0; j < paths.size(); ++j) {
-    machineIds_.push_back(paths[j].machineId);
-    topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex);
-    seqIds_.push_back(paths[j].seqId);
-  }
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName);
-    auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-        memoryFrameLine.scatterAgents[machineCur].get());
-    scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_);
-    scatterAgent->forward(PASS_TEST);
-    NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                           memoryFrameLine.scatterAgents[machineCur]);
-  }
-}
-
-void RecurrentGradientMachine::forwardFrame(int machineCur) {
-  // forward
-  const std::vector<Argument> inArgs;
-  std::vector<Argument> outArgs;
-  frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-  copyDataOutlinkFrame(machineCur);
-
-  IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids;
-  MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in;
-  IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */);
-    cpuId_->copyFrom(*ids);
-    Matrix::resizeOrCreate(cpuProb_,
-                           in->getHeight(),
-                           in->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    cpuProb_->copyFrom(*in);
-    IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */);
-    cpuEos_->copyFrom(*eos);
-  } else {
-    cpuId_ = ids;
-    cpuProb_ = in;
-    cpuEos_ = eos;
-  }
-}
-
-void RecurrentGradientMachine::singlePathExpand(Path& curPath,
-                                                size_t curPathId,
-                                                std::vector<Path>& newPaths,
-                                                size_t expandWidth) {
-  int calc_id =
-      gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0;
-
-  const int* idVec = cpuId_->getData();
-  const real* probMat = cpuProb_->getData();
-  const int* eosVec = cpuEos_->getData();
-
-  for (size_t k = 0; k < expandWidth; k++) {
-    int index = curPathId * expandWidth + k;
-    int id = idVec[index];
-    real prob = probMat[index];
-    /*
-     * Ordinarily, beam search greedily expands the most promising expandWidth
-     * paths that currently are ALWAYS returned by MaxIdLayer.
-     * In one condition, if user customizes the beam search procedure by
-     * restricting the expansion within a user defined subset,
-     * as a result, MaxIdLayer possibly COULD NOT return expandWidth
-     * vaild expansions, and it will use -1 to indicate the end of valid
-     * expansion candidates.
-     */
-    if (id == -1) break;
-
-    real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob;
-    Path newPath(
-        curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/);
-    if (this->beamSearchCtrlCallbacks_) {
-      if (beamSearchCtrlCallbacks_->stopDetermineCandidates(
-              newPath.seqId, newPath.ids, newPath.probHistory))
-        return;
-    }
-    // outFrameLines_.size() > 1UL
-    if (dataArgsSize_) {
-      newPath.machineIdVec = curPath.machineIdVec;
-      newPath.machineIdVec.push_back(curPathId);
-    }
-    bool atEos =
-        eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_;
-    // adjustNewPath
-    newPath.adjustProb(calc_id, atEos);
-    if (this->beamSearchCtrlCallbacks_) {
-      this->beamSearchCtrlCallbacks_->normOrDropNode(
-          newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb);
-    }
-    if (!newPath.isDropable()) {
-      atEos ? finalPaths_[curPath.seqId].push_back(newPath)
-            : newPaths.push_back(newPath);
-    }
-  }  // for expandWidth
-
-  if (gDiyProbStop) {
-    gDiyProbStop(calc_id);
-  }
-}
-
-void RecurrentGradientMachine::beamExpand(std::vector<Path>& paths,
-                                          std::vector<Path>& newPaths) {
-  size_t candidatePathCount = paths.size();
-  // idVec.size() could be larger than candidatePathCount * beam,
-  // so user can drop some node customly.
-  CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL);
-  size_t expandWidth = cpuId_->getSize() / candidatePathCount;
-
-  // iterate over each sequence
-  size_t totalExpandCount = 0;
-  int prevSeqId = -1;
-  int curSeqId = 0;
-  for (size_t j = 0; j <= candidatePathCount; j++) {
-    // expansions of a single sequence are all processed
-    curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1);
-    if (prevSeqId != -1 && curSeqId != prevSeqId) {
-      totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount);
-    }
-    if (j == candidatePathCount) return;
-    singlePathExpand(paths[j], j, newPaths, expandWidth);
-
-    prevSeqId = paths[j].seqId;
-  }  // for paths
-}
-
-// Drop extra nodes to beam size.
-size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
-                                            size_t seqId,
-                                            size_t totalExpandCount) {
-  size_t minNewPathSize =
-      std::min(getBeamSize(), newPaths.size() - totalExpandCount);
-  if (!minNewPathSize) {
-    return 0;
-  }
-  std::nth_element(newPaths.begin() + totalExpandCount,
-                   newPaths.begin() + totalExpandCount + minNewPathSize,
-                   newPaths.end(),
-                   Path::greaterPath);
-  newPaths.resize(totalExpandCount + minNewPathSize);
-
-  real minPathLogProb =
-      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-  real maxPathLogProb =
-      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-
-  // Remove the already formed paths that are relatively short
-  finalPaths_[seqId].erase(
-      std::remove_if(finalPaths_[seqId].begin(),
-                     finalPaths_[seqId].end(),
-                     [&](Path& p) { return p.logProb < minPathLogProb; }),
-      finalPaths_[seqId].end());
-  for (auto p : finalPaths_[seqId]) {
-    if (minFinalPathLogProb_[seqId] > p.logProb) {
-      minFinalPathLogProb_[seqId] = p.logProb;
-    }
-  }
-
-  if (finalPaths_[seqId].size() >= getBeamSize() &&
-      minFinalPathLogProb_[seqId] >= maxPathLogProb) {
-    newPaths.resize(totalExpandCount);
-    return 0;
-  }
-  return minNewPathSize;
-}
-
-void RecurrentGradientMachine::fillGenOutputs() {
-  size_t numResults = generator_.config.num_results_per_sample();
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
-    std::partial_sort(finalPaths_[i].begin(),
-                      finalPaths_[i].begin() + minFinalPathsSize,
-                      finalPaths_[i].end(),
-                      Path::greaterPath);
-    finalPaths_[i].resize(minFinalPathsSize);
-  }
-
-  generator_.ids.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  if (numResults > 1) {
-    int idsProbSaveSize = 0;
-    for (auto inSeq : finalPaths_) {
-      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
-      idsProbSaveSize += inSeq.size();
-    }
-    Matrix::resizeOrCreate(
-        generator_.outArg.value, idsProbSaveSize, 1, false, false);
-    real* idsProb = generator_.outArg.value->getData();
-
-    real* probs = generator_.outArg.in->getData();
-    size_t curPos = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        Path& path = finalPaths_[i][j];
-        size_t genLen = path.ids.size();
-        generator_.ids.push_back(genLen);  // sequence size
-        generator_.ids.insert(
-            generator_.ids.end(), path.ids.begin(), path.ids.end());
-        generator_.ids.push_back(-1);  // end of sequence
-
-        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
-        curPos += genLen;
-        idsProb[curPos++] = -1.0;
-        probs[i * numResults + j] = path.logProb;
-      }
-      starts[i + 1] = generator_.ids.size();
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      CHECK(!finalPaths_[i].empty());
-      Path& path = finalPaths_[i][0];
-      generator_.ids.insert(
-          generator_.ids.end(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + path.ids.size();
-    }
-  }
-}
-
-void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    Argument outFrame;
-    outFrame.resizeAndCopyFrom(
-        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
-    dataArgsFrame_[i].emplace_back(outFrame);
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    bool isSeq, std::vector<Argument>& outArgs) {
-  batchMachineIdVec_.clear();
-
-  size_t seqIdx = 0;
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-      if (isSeq) {
-        for (size_t i = 0; i < machineIdVec.size(); ++i) {
-          size_t rowId = machineIdVec[i];
-          int* seqPos =
-              outArgs[i].sequenceStartPositions->getMutableData(false);
-          batchMachineIdVec_.push_back(seqPos[rowId]);
-        }
-      } else {
-        batchMachineIdVec_.insert(
-            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
-      }
-      seqIdx++;
-    }
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
-    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
-  size_t totalSeqNum = std::accumulate(
-      finalPaths_.begin(),
-      finalPaths_.end(),
-      0UL,
-      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
-  copySize.resize(totalSeqNum, 1);
-
-  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
-  if (isSeq) {
-    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
-    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
-             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
-    int* starts = inputSeqStartPos->getMutableData(false);
-    int seqId = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
-                                            : starts[j + 1] - starts[j];
-        batchMachineStartPos_[seqId + 1] =
-            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
-        seqId++;
-      }
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
-      batchMachineStartPos_[i + 1] =
-          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlink() {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    bool isSeq = dataArgsFrame_[i][0].hasSeq();
-    std::vector<int> copySize;
-    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
-    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
-
-    dataArgs_[i].concat(dataArgsFrame_[i],
-                        batchMachineIdVec_,
-                        batchMachineStartPos_,
-                        copySize,
-                        useGpu_,
-                        HPPL_STREAM_1,
-                        PASS_TEST);
-    auto dataAgent =
-        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
-    CHECK_NOTNULL(dataAgent);
-    dataAgent->setData(dataArgs_[i]);
-  }
-}
-
-void RecurrentGradientMachine::beamSearch(size_t batchSize) {
-  finalPaths_.clear();
-  finalPaths_.resize(batchSize);
-  seqIds_.resize(batchSize);
-  minFinalPathLogProb_.clear();
-  minFinalPathLogProb_.resize(batchSize, 0);
-
-  std::vector<Path> paths;
-  std::vector<Path> newPaths;
-  for (size_t i = 0; i < batchSize; ++i) {
-    paths.push_back(Path(i));
-    if (this->beamSearchCtrlCallbacks_) {
-      paths.back().recordHistory();
-    }
-  }
-
-  // restart beam search
-  stopBeamSearch_ = false;
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    int machineCur = i % 2;
-    std::unique_ptr<
-        ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&, int>>
-        statisticsBlock;
-    if (this->beamSearchStatistics_) {
-      auto ptr =
-          new ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&,
-                              int>(beamSearchStatistics_->onEachStepStarted,
-                                   beamSearchStatistics_->onEachStepStoped,
-                                   i);
-      statisticsBlock.reset(ptr);
-    }
-    if (stopBeamSearch_) break;
-
-    if (i) connectPrevFrame(i, paths);
-
-    if (this->beamSearchCtrlCallbacks_) {
-      std::vector<std::vector<int>*> prefixes;
-      prefixes.resize(paths.size());
-      std::transform(
-          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
-            return const_cast<std::vector<int>*>(&p.ids);
-          });
-      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
-          prefixes, frames_[machineCur].get(), i);
-    }
-
-    forwardFrame(machineCur);
-    beamExpand(paths, newPaths);
-    if (newPaths.empty()) break;
-
-    paths = newPaths;
-    newPaths.clear();
-  }  // end for machineCur
-  fillGenOutputs();
-}
-
-void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) {
-  if (gDiyProbMethod) {
-    logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
deleted file mode 100644
index 0a13d4f6f84..00000000000
--- a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * Private data class declares.
- * Used for user customized beam search.
- */
-class BeamSearchControlCallbacks;
-class BeamSearchStatisticsCallbacks;
-
-class RecurrentGradientMachine : public NeuralNetwork {
- public:
-  RecurrentGradientMachine(const std::string& subModelName,
-                           NeuralNetwork* rootNetwork);
-
-  // Disable copy and assign.
-  RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete;
-  RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) =
-      delete;
-
-  virtual ~RecurrentGradientMachine() {
-    this->removeBeamSearchStatisticsCallbacks();
-    this->removeBeamSearchControlCallbacks();
-  }
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void resetState() {}
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<int>& getParameterIds() { return parameterIds_; }
-
-  /**
-   * @brief BeamSearchCandidatesAdjustCallback
-   *
-   * Adjust searching candidates to restrict beam search
-   * searching within a limited subset of all possibile paths.
-   *
-   * The first parameter is the prefixes of all formed paths in current
-   * beam search step, whose type is basically int[][].
-   *
-   * The second parameter is a pointer to the network used to generate sequence,
-   * user can use this pointer to tranverse each layer in the network to
-   * modify behaivors of a particular layer.
-   *
-   * The third parameter is an integer to indicate the iteration number of
-   * beam search, so that user can customize different operations in different
-   * beam search iterations.
-   */
-  typedef std::function<void(
-      const std::vector<std::vector<int>*>&, NeuralNetwork*, const int)>
-      BeamSearchCandidatesAdjustCallback;
-
-  /**
-   * @brief DropCallback
-   *
-   * Drop a whole prefix or one candidate in beam search or not.
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is one path in beam search,
-   * which is made up of node indices.
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * Return true if this prefix or candidate is expected to be dropped.
-   */
-  typedef std::function<bool(
-      int seqId, const std::vector<int>&, const std::vector<real>&)>
-      DropCallback;
-
-  /**
-   * @brief NormOrDropNodeCallback
-   *
-   * Normalize a path's probabilities or just drop it by modifying path.logProb
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is path.ids
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * The fourth parameter is the probability of the whole path.
-   */
-  typedef std::function<void(
-      int seqId, const std::vector<int>&, std::vector<real>&, real*)>
-      NormOrDropNodeCallback;
-
-  /**
-   * @brief Register beam search control callbacks. Used for prediction.
-   *
-   * @param queryBeamSearch: Give the sequences already formed, return the
-   * nodes expected to be expanded.
-   * Input: A pointer to an array holding pathes which have been expanded
-   * Return: A pointer to an array holding nodes wanted to be expanded.
-   *
-   * @param dropOneNode: Early drop a node in one beam search step.
-   * Given the path formed and probability history, decide whether a node
-   * should be dropped or not.
-   *
-   * @param stopBeamSearch: Early stop a path in one beam search step.
-   * Given the path and probability history, decide whether a path
-   * should be dropped or not.
-   */
-  void registerBeamSearchControlCallbacks(
-      const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-      const NormOrDropNodeCallback& normOrDropNode,
-      const DropCallback& stopBeamSearch);
-
-  /**
-   * @brief Remove user costumized beam search callbacks,
-   *
-   * make sequence generation acts like normal beam search.
-   */
-  void removeBeamSearchControlCallbacks();
-
-  /**
-   * @brief EachStepCallback
-   *
-   * Invoke with beam search step.
-   */
-  typedef std::function<void(int)> EachStepCallback;
-
-  /**
-   * @brief register statistics methods for performance profile of beam search.
-   *
-   * @param onEachStepStarted: invoke once a beam search step starts.
-   * Its input is index of the beam search step.
-   *
-   * @param onEachStepStoped: invoke once a beam search step ends.
-   * Its input is index of the beam search step.
-   */
-  void registerBeamSearchStatisticsCallbacks(
-      const EachStepCallback& onEachStepStarted,
-      const EachStepCallback& onEachStepStoped);
-
-  /**
-   * @brief Remove beam search callbacks.
-   */
-  void removeBeamSearchStatisticsCallbacks();
-
-  /**
-   * @brief Stop beam search for current source.
-   *
-   * Will restart beam search in the next forward
-   */
-  void stopBeamSearch();
-
-  struct Path {
-    /**
-     * @brief ids, path of beam search.
-     */
-    std::vector<int> ids;
-
-    /**
-     * @brief idsProb, log probability of each generated word.
-     */
-    std::vector<real> idsProb;
-
-    /**
-     * @brief logProb, current probability of path.
-     */
-    real logProb;
-
-    int machineId;  // index of sample in frame
-    int topIndex;   // index of MaxIdLayer output in one sample
-    int seqId;      // index of sequence in batch generation
-    std::vector<int> machineIdVec;
-
-    /**
-     * @brief A record of each node's probality in a formed path in beam search.
-     *
-     * @note  It could be empty when history is not recorded. If the history is
-     *        wanted to be recorded, recordHistory() MUST be invoked first.
-     */
-    std::vector<real> probHistory;
-
-    /**
-     * @brief Path default ctor, first logProb is 0.
-     */
-    Path() {
-      logProb = 0;
-      seqId = 0;
-    }
-    explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }
-
-    /**
-     * @brief Create a new path based on an old path and
-     * a new node with probability.
-     *
-     * @param old       old path
-     * @param newId     index of the new node
-     * @param logProb   probability of the new node.
-     * @param machineId sample index of a frame in RNN
-     * @param topIndex  index of MaxIdLayer output in one sample
-     */
-    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
-        : ids(old.ids),
-          idsProb(old.idsProb),
-          logProb(old.logProb + logProb),
-          machineId(machineId),
-          topIndex(topIndex),
-          seqId(old.seqId) {
-      ids.push_back(newId);
-      idsProb.push_back(logProb);
-      if (!old.probHistory.empty()) {
-        this->probHistory = old.probHistory;
-        // probHistory store current prob, not sum
-        this->probHistory.push_back(logProb);
-      }
-    }
-
-    /**
-     * @brief operator <
-     *
-     * Path a < Path b means log probability of a is smaller than that of b
-     */
-    bool operator<(const Path& other) const {
-      return (logProb < other.logProb);
-    }
-
-    static bool greaterPath(const Path& a, const Path& b) { return (b < a); }
-
-    /**
-     * @brief Start recording history in this path.
-     */
-    void recordHistory() { this->probHistory.push_back(this->logProb); }
-
-    /**
-     * @brief Adjust probability for DIY beam search interface.
-     * In normal situation, it will do nothing.
-     *
-     * @param calc_id: the object id for DIY beam search interface.
-     * @param atEos: at end of sequence or not.
-     */
-    void adjustProb(int calc_id, bool atEos = false);
-
-    /**
-     * @brief isDropable indacating whether the current node will be
-     * dropped or not in beam search.
-     *
-     * @note: if logProb is -inf, current node will be dropped.
-     * @return true to drop the current node.
-     */
-    bool isDropable() const { return std::isinf(logProb) && logProb < 0; }
-  };
-
-  /**
-   * @brief access beam search results.
-   * @return beam search results.
-   */
-  const std::vector<std::vector<Path>>& getFinalPaths() const {
-    return this->finalPaths_;
-  }
-
- protected:
-  std::vector<Argument::SeqInfo> commonSeqInfo_;
-  ICpuGpuVectorPtr sequenceStartPositions_;
-  void calcSequenceStartPositions();
-  void checkInputConsistency(int inlinkId,
-                             const std::vector<Argument::SeqInfo>& seqInfo);
-  void reorganizeInput(PassType passType);
-  void reorganizeOutput(PassType passType);
-  void connectFrames(PassType passType);
-  void calcNumSequencesAtEachStep();
-
-  void resizeOrCreateFrames(int numFrames);
-  void resizeBootFrame(int numSequences);
-
-  void generateSequence();
-  void oneWaySearch(size_t batchSize);
-  void beamSearch(size_t batchSize);
-
-  struct InFrameLine {
-    std::string linkName;
-    LayerPtr inLayer;
-    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    Argument outArg;               // scatter output argument
-  };
-  std::vector<InFrameLine> inFrameLines_;
-
-  struct OutFrameLine {
-    std::string layerName;
-    LayerPtr agentLayer;
-    std::vector<LayerPtr> frames;
-  };
-  std::vector<OutFrameLine> outFrameLines_;
-
-  struct MemoryFrameLine {
-    std::string layerName;
-    std::string linkName;
-    LayerPtr bootLayer;  // actually used biasLayer or rootAgent
-    LayerPtr biasLayer;
-    LayerPtr rootLayer;  // layer in root network to boot this memory
-    LayerPtr rootAgent;  // agent to link rootLayer
-    std::vector<LayerPtr> frames;
-    std::vector<LayerPtr> agents;
-    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
-    Argument outArg;                      // scatter output argument
-    // Different memoryFrameLine have different element as follows
-    IVectorPtr allIds;  // scattered id of realLayer
-    ICpuGpuVectorPtr
-        sequenceStartPositions;  // scattered sequenceStartPositions
-  };
-  std::vector<MemoryFrameLine> memoryFrameLines_;
-
-  // Each inFrameLines(inlinks) has its own info(elements) below,
-  // and all outFrameLines(outlinks) share the info with one inFrameLine,
-  // which is assigned by targetInfoInlinkId_.
-  struct Info {
-    // The original positions in the original batch
-    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
-
-    // index of allIds for each step [maxSequenceLength_]
-    // idIndex[i] is the total length of the first i sequences
-    std::vector<int> idIndex;
-
-    ICpuGpuVectorPtr
-        sequenceStartPositions;         // scattered sequenceStartPositions
-    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
-  };
-  std::vector<Info> info_;  // for input
-
-  // numSeqs_[i] is the number sequences which is longer than i (for sequence
-  // data) or has more than i subsequences (for subsequence data)
-  // Equivalently, numSeqs_[i] is the number of sequences at step i;
-  std::vector<int> numSeqs_;
-
-  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
-
-  void checkOutputConsistency(OutFrameLine& outFrameLine);
-
-  /* create scattered id infomation for all realLayer of inFrameLines one time.
-   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
-   *  for all realLayer of inFrameLines one time.
-   */
-  void createInFrameInfo(int inlinks_id,
-                         const Argument& input,
-                         PassType passType);
-  void createInFrameInfo_nonseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-  void createInFrameInfo_seq(int inlinks_id,
-                             const Argument& input,
-                             PassType passType);
-  void createInFrameInfo_subseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-
-  void createOutFrameInfo(OutFrameLine& outFrameLine,
-                          Info& info,
-                          ICpuGpuVectorPtr& sequenceStartPositions,
-                          ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
-                              Info& info,
-                              ICpuGpuVectorPtr& sequenceStartPositions,
-                              ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
-                                 Info& info,
-                                 ICpuGpuVectorPtr& sequenceStartPositions,
-                                 ICpuGpuVectorPtr& subSequenceStartPositions);
-
-  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
-                             PassType passType);
-
-  void copyScattedId(std::vector<int>& srcIds, IVectorPtr* dstIds, int size);
-
-  void selectRowsOneTime(LayerPtr layer,
-                         const IVectorPtr& allIds,
-                         Argument* arg,
-                         PassType passType);
-
-  void createSeqPos(const std::vector<int>& sequenceStartPosition,
-                    ICpuGpuVectorPtr* sequenceStartPositions);
-
-  // for generator
-  struct EosFrameLine {
-    std::vector<LayerPtr> layers;
-  };
-  std::unique_ptr<EosFrameLine> eosFrameLine_;
-
-  struct Generator {
-    GeneratorConfig config;
-    std::vector<int> ids;       // store generated sequences
-    std::vector<real> idsProb;  // log probability of each generated word
-    Argument outArg;            // final output argument
-  };
-  bool generating_;
-  Generator generator_;
-
-  std::vector<std::unique_ptr<NeuralNetwork>> frames_;
-
-  NeuralNetwork* rootNetwork_;
-  bool reversed_;
-
-  int maxSequenceLength_;  // Max top-level length
-  bool useGpu_;
-  bool stopBeamSearch_;
-
-  std::vector<int>
-      parameterIds_;  // parameters actually used by this Layer Group
-
-  // store final argument of outFrameLines_
-  std::vector<Argument> dataArgs_;
-  // store each frame's output argument of outFrameLines_
-  std::vector<std::vector<Argument>> dataArgsFrame_;
-  size_t dataArgsSize_;  // size of dataArgs_ = size of dataArgsFrame_
-
-  IVectorPtr cpuId_;
-  MatrixPtr cpuProb_;
-  IVectorPtr cpuEos_;
-
- private:
-  /*
-   * @return beam size in beam search
-   */
-  size_t getBeamSize() { return generator_.config.beam_size(); }
-
-  /*
-   * @return number of sequence in a batch in generation
-   */
-  size_t getGenBatchSize();
-
-  /*
-   * @brief store output of the machineCur-th frame during generation, for
-   * creating the final outlink after the entire generation process is finished.
-   *
-   * In generation, if the layer group has more than 1 outlink, the first
-   * one is reserved to store the generated word indices, the others are data
-   * outlinks, that can be used like a common layer in the network.
-   *
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void copyDataOutlinkFrame(size_t machineCur);
-
-  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlink
-   * except the first one is a data outlink. In RecurrentLayerGroup, each time
-   * step is a separate Network, outputs of a layer inside the
-   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
-   * specified as an outlink of RecurrentLayerGroup. This function will
-   * collect outputs in each time step of each generated sequence which are
-   * dispersed in separate Arguments to form a new single Argument as output of
-   * RecurrentLayerGroup.
-   */
-  void createDataOutlink();
-
-  /*
-   * @brief decide to select how many rows from the Matrix stored the forward
-   * pass results from a start position.
-   *
-   * @param isSeq: a flag indicating whetehr the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the the returned Arguments of the forward pass
-   * during the generation process.
-   * @param copySize: the returned result, number of rows to select from the
-   * Matrix stored the forward pass results from a start position.
-   */
-  void createDataOutlinkCopySizeInfo(bool isSeq,
-                                     std::vector<Argument>& outArgs,
-                                     std::vector<int>& copySize);
-
-  /*
-   * @brief decide index of the start row for each time step of a generated
-   * sequence in Matrix stored the entire beam search batch's forward pass
-   * results.
-   *
-   * @param isSeq: a flag indicating whether the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the returned Arguments of the forward pass
-   * during the generation process.
-   */
-  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
-
-  /*
-   * @brief used in beam search, connect previous frame to form recurrent link
-   * @param stepId : iteration number of generation process.
-   * It equals to the length of longest half-generated sequence.
-   * @param paths : half-generated paths that are going to be expanded
-   * in current beam search iteration.
-   */
-  void connectPrevFrame(int stepId, std::vector<Path>& paths);
-
-  /*
-   * @brief used in beam search, forward current recurrent frame
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void forwardFrame(int machineCur);
-
-  /*
-   * @brief reduce all expanded paths to beam size.
-   *
-   * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths
-   * for the seqId-th sequence
-   * @param seqId : sequence index in a batch
-   * @param totalExpandCount : number of already shrinked paths in newPaths
-   * @return size of retained paths at the end of a beam search iteration
-   */
-  size_t beamShrink(std::vector<Path>& newPaths,
-                    size_t seqId,
-                    size_t totalExpandCount);
-
-  /*
-   * @brief expand a single path to expandWidth new paths
-   * with highest probability
-   * @param curPath : path to be expanded
-   * @param curPathId : index of curPath in member newPaths
-   * @param expandWidth : number of paths to be expanded
-   */
-  void singlePathExpand(Path& curPath,
-                        size_t curPathId,
-                        std::vector<Path>& newPaths,
-                        size_t expandWidth);
-
-  /*
-   * @brief A new beam search iteration. Each half-generated paths in previous
-   * beam search iteration are further expanded to beam_size new paths
-   * with highest probabilities, and then all the expanded paths are again
-   * reduced to beam_size paths according to their log probabilities.
-   * @param paths : half-generated paths in previous iteration.
-   * @param newPaths : paths expanded and then reduces in current iteration.
-   */
-  void beamExpand(std::vector<Path>& paths, std::vector<Path>& newPaths);
-
-  /*
-   * @brief fill sequence start positions and some other information that are
-   * uesed by the "text_printer" evaluator.
-   */
-  void fillGenOutputs();
-
-  std::vector<int> machineIds_;
-  std::vector<int> topIds_;
-  std::vector<int> seqIds_;
-  std::vector<int> batchMachineIdVec_;
-  std::vector<int> batchMachineStartPos_;
-  std::vector<std::vector<Path>> finalPaths_;
-  std::vector<real> minFinalPathLogProb_;
-  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
-  BeamSearchStatisticsCallbacks* beamSearchStatistics_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.cpp b/paddle/legacy/gserver/layers/AddtoLayer.cpp
deleted file mode 100644
index 39c5603d938..00000000000
--- a/paddle/legacy/gserver/layers/AddtoLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AddtoLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(addto, AddtoLayer);
-
-bool AddtoLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void AddtoLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  reserveOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    MatrixPtr input = getInputValue(i);
-    i == 0 ? outV->assign(*input) : outV->add(*input);
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AddtoLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      preGrad->add(*getOutputGrad());
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.h b/paddle/legacy/gserver/layers/AddtoLayer.h
deleted file mode 100644
index ad3cefe1a4d..00000000000
--- a/paddle/legacy/gserver/layers/AddtoLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * This layer just simply add all input layers together, then activate
- * the sum inputs. Each input of this layer should be the same size,
- * which is also the output size of this layer.
- * \f[
- *   y=f(\sum_{i}x_i + b)
- * \f]
- * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
- * activation function.
- *
- * The config file api is addto_layer.
- */
-class AddtoLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AddtoLayer() {}
-
-  /**
-   * Intialization of AddtoLayer.
-   */
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Forward propagation.
-   * @note There is no weight matrix for each input,
-   *       because it just a simple add operation.
-   */
-  void forward(PassType passType) override;
-
-  /**
-   * Backward propagation.
-   */
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.cpp b/paddle/legacy/gserver/layers/AgentLayer.cpp
deleted file mode 100644
index bae89b2fa34..00000000000
--- a/paddle/legacy/gserver/layers/AgentLayer.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AgentLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(agent, AgentLayer);
-
-bool AgentLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void AgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    if (realOutput.hasSeq()) {
-      int numRows =
-          realOutput.sequenceStartPositions->getData(false)[numSamples_];
-      output_.subArgFrom(realOutput,
-                         /* offset */ 0,
-                         numRows,
-                         getSize(),
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ 0,
-                         /* seqSize */ numSamples_ + 1);
-    } else {
-      output_.subArgFrom(
-          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
-    }
-  } else {
-    output_ = realOutput;
-  }
-}
-
-bool GatherAgentLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void GatherAgentLayer::copyIdAndSequenceInfo(
-    ICpuGpuVectorPtr sequenceStartPositions,
-    ICpuGpuVectorPtr subSequenceStartPositions,
-    const IVectorPtr& ids,
-    const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = sequenceStartPositions;
-  output_.subSequenceStartPositions = subSequenceStartPositions;
-  allIds_ = ids;
-  idIndex_ = idIndex;
-}
-
-void GatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  forwardIds(passType);
-  forwardValue(passType);
-}
-
-void GatherAgentLayer::forwardValue(PassType passType) {
-  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
-  if (!valueReal) return;
-
-  int height = allIds_->getSize();
-  int width = this->getSize();
-  resetOutput(height, width);
-  idsVec_.resize(idIndex_.size());
-
-  const MatrixPtr& outV = getOutputValue();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realV = realLayers_[i]->getOutputValue();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realV->getHeight(),
-                                 useGpu_);
-    realV->addToRows(*outV, *idsVec_[i]);
-  }
-}
-
-namespace {
-
-// dest[index[i]] <- src[i] for each i
-void copyElements(const IVector& srcVec,
-                  const IVector& indexVec,
-                  IVector& destVec) {
-  const int* src = srcVec.getData();
-  const int* index = indexVec.getData();
-  int* dest = destVec.getData();
-  int len = indexVec.getSize();
-  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
-  for (int i = 0; i < len; ++i) {
-    dest[index[i]] = src[i];
-  }
-}
-}  // namespace
-
-void GatherAgentLayer::forwardIds(PassType passType) {
-  IVectorPtr realId = realLayers_[0]->getOutputLabel();
-  if (!realId) return;
-
-  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
-  IVectorPtr outId = output_.ids;
-  idsVec_.resize(idIndex_.size());
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realId->getSize(),
-                                 useGpu_);
-    execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
-  }
-}
-
-void GatherAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  const MatrixPtr& outputGrad = getOutputGrad();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realG = realLayers_[i]->getOutputGrad();
-    if (realG) {
-      realG->selectRows(*outputGrad, *idsVec_[i]);
-    }
-  }
-}
-
-bool ScatterAgentLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void ScatterAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  int width = this->getSize();
-  if (selectionMode_) {
-    forwardWithSelection(passType);
-  } else {
-    if (realOutArg_.hasSeq()) {
-      output_.subArgFrom(realOutArg_,
-                         /* offset */ idIndex_,
-                         idSize_,
-                         width,
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ seqStartPosIndex_,
-                         /* seqSize */ numSequences_);
-    } else {
-      output_.subArgFrom(
-          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
-    }
-  }
-}
-
-void ScatterAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  CHECK(!selectionMode_);
-
-  const MatrixPtr& outputGrad = realOutArg_.grad;
-  const MatrixPtr& realGrad = realLayer_->getOutputGrad();
-  if (realGrad) {
-    // for agent in inFrameLines and memoryFrameLines,
-    // only first scatterAgentLayer should do addToRows in backward
-    if (handleBackward_) {
-      outputGrad->addToRows(*realGrad, *ids_);
-    }
-  }
-}
-
-REGISTER_LAYER(gather_agent, GatherAgentLayer);
-REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
-
-void ScatterAgentLayer::forwardWithSelection(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  const Argument& input = realLayer_->getOutput();
-  CHECK_EQ(realLayer_->getSize(), this->getSize());
-  int width = this->getSize();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
-
-  if (!input.hasSeq()) {
-    if (realLayer_->getOutput().ids) {
-      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-    }
-    if (realLayer_->getOutput().value) {
-      int height = ids_->getSize();
-      resetOutput(height, width);
-
-      const MatrixPtr& outV = getOutputValue();
-      const MatrixPtr& realV = realLayer_->getOutputValue();
-      outV->selectRows(*realV, *ids_);
-    }
-  } else {
-    // Putting the generation logic here is really an ugly hack!
-    // used in generation
-    int height = 0;
-    size_t numSequences = ids_->getSize();
-    const int* starts = input.getCpuStartPositions();
-    size_t size = input.hasSubseq() ? input.getNumSubSequences()
-                                    : input.getNumSequences();
-    const int* cpuIds = cpuIds_->getData();
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      size_t seqId = cpuIds[i];
-      CHECK_LT(seqId, size);
-      height += starts[seqId + 1] - starts[seqId];
-    }
-    reserveOutput(height, width);
-
-    const MatrixPtr& outputValue = getOutputValue();
-
-    CHECK_NE(input.sequenceStartPositions.get(),
-             output_.sequenceStartPositions.get());
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-
-    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = inputStartPos_->getMutableData(false);
-
-    size_t offsetOut = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      outStarts[i] = offsetOut;
-      size_t seqId = cpuIds[i];
-      int size = starts[seqId + 1] - starts[seqId];
-      for (int j = 0; j < size; j++) {
-        inStarts[offsetOut + j] = starts[seqId] + j;
-      }
-      offsetOut += size;
-    }
-    outStarts[numSequences] = offsetOut;
-
-    outputValue->copyByRowIndex(*input.value,
-                                *inputStartPos_->getVector(useGpu_));
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.h b/paddle/legacy/gserver/layers/AgentLayer.h
deleted file mode 100644
index a05eac5e704..00000000000
--- a/paddle/legacy/gserver/layers/AgentLayer.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * AgentLayer use as a virtual input of another layer in config,
- * before execute forward/backward, setRealLayer() should be
- * called to set one and only one real layer
- */
-class AgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  int numSamples_;
-
- public:
-  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // if *numSamples* set,
-  // real layer output will only use first *numSamples* rows
-  void setRealLayer(LayerPtr layer, int numSamples = 0) {
-    realLayer_ = layer;
-    numSamples_ = numSamples;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-/**
- * Like AgentLayer, but it can gather many real layers. Each real
- * layer give a few rows of a sequence, after gather all real layers,
- * GatherAgentLayer collect a complete sequence.
- */
-class GatherAgentLayer : public Layer {
- protected:
-  std::vector<LayerPtr> realLayers_;
-  std::vector<IVectorPtr> idsVec_;
-  // we don't clear idsVec_ vector to aviod IVector alloc/free
-  IVectorPtr allIds_;
-  std::vector<int> idIndex_;
-
- public:
-  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~GatherAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // call before addRealLayer
-  void clearRealLayers() { realLayers_.clear(); }
-
-  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
-                             ICpuGpuVectorPtr subSequenceStartPositions,
-                             const IVectorPtr& allIds,
-                             const std::vector<int>& idIndex);
-
-  // add one real layer, can call many times
-  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void forwardValue(PassType passType);
-  void forwardIds(PassType passType);
-};
-
-/**
- * Like AgentLayer, but only select a few rows in real layer.
- * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
- * are the selected row ids. It's used to scatter one layer's output
- * to many small submodels. ScatterAgentLayer can support ids real layer,
- * if it is, the agent will select a few ids in real layer.
- */
-class ScatterAgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  IVectorPtr ids_;
-  IVectorPtr cpuIds_;
-  Argument realOutArg_;
-  int idIndex_;
-  int idSize_;
-  int seqStartPosIndex_;
-  int numSequences_;  // number of sequences in this scatterAgentLayer
-  bool handleBackward_;
-
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-  // true for setRealLayer, false for setRealLayerAndOutput
-  bool selectionMode_;
-
- public:
-  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~ScatterAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief set real layer in generation
-   *
-   * @param layer[input]    realLayer
-   * @param ids[input]      row id in real layer
-   * @param copyId[input]   whether to copy a cpu version of ids,
-   *                        false(default) in ScatterAgentLayer, and
-   *                        true in SequenceScatterAgentLayer.
-   */
-  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
-    realLayer_ = layer;
-    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
-    ids_->copyFrom(ids.data(), ids.size());
-    if (useGpu_) {
-      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-      cpuIds_->copyFrom(ids.data(), ids.size());
-    } else {
-      cpuIds_ = ids_;
-    }
-    selectionMode_ = true;
-  }
-
-  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
-  // are selected row for realOutArg in realLayer
-  void setRealLayerAndOutput(LayerPtr layer,
-                             const Argument& outArg,
-                             const IVectorPtr& ids,
-                             int idIndex,
-                             int idSize,
-                             bool handleBackward) {
-    realLayer_ = layer;
-    realOutArg_ = outArg;
-    ids_ = ids;
-    idIndex_ = idIndex;
-    idSize_ = idSize;
-    handleBackward_ = handleBackward;
-    selectionMode_ = false;
-  }
-
-  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
-                                 int seqStartPosIndex,
-                                 int numSequences) {
-    realOutArg_.sequenceStartPositions = sequenceStartPositions;
-    seqStartPosIndex_ = seqStartPosIndex;
-    numSequences_ = numSequences;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  void forwardWithSelection(PassType passType);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.cpp b/paddle/legacy/gserver/layers/AverageLayer.cpp
deleted file mode 100644
index 0539da79371..00000000000
--- a/paddle/legacy/gserver/layers/AverageLayer.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(average, AverageLayer);
-
-bool AverageLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-
-  // average strategy
-  if (config_.average_strategy() == "average") {
-    mode_ = kAverage;
-  } else if (config_.average_strategy() == "sum") {
-    mode_ = kSum;
-  } else if (config_.average_strategy() == "squarerootn") {
-    mode_ = kAverageSquareRootN;
-  } else {
-    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
-  }
-  return true;
-}
-
-void AverageLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(
-      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-
-  /* add the bias-vector AFTER average operation */
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AverageLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  if (getInputGrad(0)) {
-    getInputGrad(0)->sequenceAvgBackward(
-        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.h b/paddle/legacy/gserver/layers/AverageLayer.h
deleted file mode 100644
index a0d457d35f4..00000000000
--- a/paddle/legacy/gserver/layers/AverageLayer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal average" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = average_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the average pooling
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-class AverageLayer : public SequencePoolLayer {
- public:
-  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  explicit AverageLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  int mode_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
deleted file mode 100644
index 4dcbd8dc270..00000000000
--- a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BatchNormBaseLayer.h"
-#include "BatchNormalizationLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnBatchNormLayer.h"
-#endif
-
-namespace paddle {
-
-bool BatchNormBaseLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  /* initialize the weightList */
-  // first is Input in configure
-  // other two is created in config_parser.py
-  CHECK_EQ(inputLayers_.size(), 3U);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  channels_ = conf.channels();
-  calFeatureMapSize();
-
-  if (config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-  movingAvgFraction_ = config_.moving_average_fraction();
-  epsilon_ = config_.epsilon();
-
-  weight_.reset(new Weight(1, channels_, parameters_[0]));
-  movingMean_.reset(new Weight(1, channels_, parameters_[1]));
-  movingVar_.reset(new Weight(1, channels_, parameters_[2]));
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, channels_, biasParameter_));
-  }
-
-  savedMean_ = Matrix::create(1, channels_, false, useGpu_);
-  savedInvVar_ = Matrix::create(1, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedInvVar_->zeroMem();
-
-  return true;
-}
-
-void BatchNormBaseLayer::calFeatureMapSize() {
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
-
-  if (0 == imageD_) imageD_ = conf.img_size_z();
-  if (imageH_ == 0 && imageW_ == 0) {
-    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-    imageW_ = conf.img_size();
-  } else {
-    getOutput().setFrameHeight(imageH_);
-    getOutput().setFrameWidth(imageW_);
-    getOutput().setFrameDepth(imageD_);
-  }
-  imgPixels_ = imageH_ * imageW_ * imageD_;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
deleted file mode 100644
index 8dc1d788376..00000000000
--- a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Batch normalization layer use to normalizes the input to across the
- * batch.
- *
- * By default, calculating global mean and variance statistics via a running
- * average in the training peroid. Then the pre-calculated global mean and
- * variance are used for testing.
- *
- * Moving mean and variance are located in Parameter object when constructing
- * and the calculation will change them. Now we only save global mean and
- * variance of one thread in first node for GPU.
- * But the calculation in CPU is different, because parameters are shared by
- * multiple threads. Here using ShareCpuMatrix with lock to calculate. We
- * still save global mean and variance in first node in CPU when multi machine.
- *
- * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
- *     Training by Reducing Internal Covariate Shift." arXiv preprint
- *     arXiv:1502.03167 (2015).
- */
-
-class BatchNormBaseLayer : public Layer {
- public:
-  explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BatchNormBaseLayer() {}
-
-  /**
-   * @brief Create BatchNorm layer by norm_type, including batch_norm and
-   * cudnn_batch_norm. If do not set norm_type, it will automatically select
-   * cudnn_batch_norm for GPU and batch_norm for CPU.
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief Calculate feature map size. Some input uses frameHeight and
-   * frameWidth to store feature size
-   */
-  void calFeatureMapSize();
-
- protected:
-  /// Batch normalization scale parameter, which is referred to as gamma in
-  /// in original paper.
-  std::unique_ptr<Weight> weight_;
-  /// Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  /// Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-  /// Batch normalization bias parameter, which is referred to as beta in
-  /// in original paper.
-  std::unique_ptr<Weight> biases_;
-
-  /// Save intermediate results computed during the forward pass,
-  /// these can then be reused to speed up the backward pass.
-  MatrixPtr savedMean_;
-  MatrixPtr savedInvVar_;
-
-  /// Height or width of input image feature.
-  /// Both of them are 1 if the input is fully-connected layer.
-  int imageD_;
-  int imageH_;
-  int imageW_;
-  /// Height * Width.
-  int imgPixels_;
-  /// Feature dimension. If the input layer is conv layer, it is the channels
-  /// of feature map of the conv layer. If the input layer is fully-connected
-  /// layer, it is the dimension of fc layer.
-  int channels_;
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in this mini-batch.
-  bool useGlobalStats_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // Epsilon is a small random noise used in batch normalization for stability.
-  real epsilon_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
deleted file mode 100644
index 0297bd44c7b..00000000000
--- a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "hl_batch_transpose.h"
-#endif
-#include "BatchNormalizationLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
-
-bool BatchNormalizationLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-
-  return true;
-}
-
-void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
-  int numSamples = mat->getHeight();
-  Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedMean_->accumulateColSum(*mat);
-  savedMean_->mulScalar(1.0 / numSamples);  // E[x]
-
-  tmpMat_->assign(*mat);
-  tmpMat_->square2();
-  savedInvVar_->zeroMem();
-  savedInvVar_->accumulateColSum(*tmpMat_);
-  savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
-  savedInvVar_->addSquare(*savedMean_, -1.0);  // E[x^2] - E^2[x]
-
-  // Variance may be small negative value
-  // because of the subtraction operation.
-  // Here using clipping.
-  savedInvVar_->downClip(real(0.0));
-
-  calMovingMeanAndVar();
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::calMovingMeanAndVar() {
-  // calculating and saving moving mean and variance
-  auto& movingMean = movingMean_->getW();
-  auto& movingVar = movingVar_->getW();
-  // movingMean =  movingMean * movingAvgFraction_
-  //            + savedMean_ * (1 - movingAvgFraction_)
-  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  // movingVar =  movingVar * movingAvgFraction_
-  //           + savedInvVar_ * (1 - movingAvgFraction_)
-  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-}
-
-void BatchNormalizationLayer::setMeanAndStd() {
-  savedMean_->copyFrom(*(movingMean_->getW()));
-  savedInvVar_->copyFrom(*(movingVar_->getW()));
-  savedInvVar_->downClip(real(0.0));
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_));
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  size_t batchSize = in->getHeight();
-  CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), imgPixels_, channels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  size_t batchSize = out->getHeight();
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), channels_, imgPixels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * channels_ * imgPixels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  Matrix::resizeOrCreate(
-      expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_);
-  expandMat(getInputValue(0), expandedIn_);
-
-  if (useGlobalStats_) {
-    if (firstTest_) {
-      setMeanAndStd();
-      firstTest_ = false;
-    }
-  } else {
-    calMeanAndStd(expandedIn_);
-    firstTest_ = true;
-  }
-
-  normIn_->assign(*expandedIn_);
-  normIn_->addBias(*savedMean_, -1);     // subtract mean.
-  normIn_->divRowVector(*savedInvVar_);  // divide std.
-
-  expandedOut_->assign(*normIn_);
-  expandedOut_->mulRowVector(*weight_->getW());  // multiple gamma.
-  if (biases_) {
-    expandedOut_->addBias(*(biases_->getW()), 1);  // add beta.
-  }
-  MatrixPtr out = getOutputValue();
-  shrinkMat(expandedOut_, out);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void BatchNormalizationLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  int batchSize = getInputValue(0)->getHeight();
-
-  Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_);
-
-  Matrix::resizeOrCreate(
-      expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-
-  expandMat(getOutputGrad(), expandedOutGrad_);
-
-  // compute derivatives.
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*expandedOutGrad_, 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  if (weight_->getWGrad()) {
-    tmpMat_->dotMul(*expandedOutGrad_, *normIn_);
-    weight_->getWGrad()->collectBias(*tmpMat_, 1);
-  }
-
-  // compute input gradients.
-  normInGrad_->assign(*expandedOutGrad_);
-  normInGrad_->mulRowVector(*(weight_->getW()));  // multiple gamma.
-  // normInGrad * (x - \mu)/ \sqrt(\delta^2)
-  tmpMat_->dotMul(*normInGrad_, *normIn_);
-  stdGrad_->zeroMem();
-  stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_));
-  tmpGrad_->assign(*normIn_);
-  tmpGrad_->mulRowVector(*stdGrad_);
-
-  meanGrad_->zeroMem();
-  meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_));
-
-  expandedInGrad_->zeroMem();
-  expandedInGrad_->add(*normInGrad_, *tmpGrad_);
-  expandedInGrad_->addRowVector(*meanGrad_);
-  expandedInGrad_->divRowVector(*savedInvVar_);
-
-  shrinkMat(expandedInGrad_, inGrad_);
-  if (getInputGrad(0)) {
-    getInputGrad(0)->add(*getInputGrad(0), *inGrad_);
-  }
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.h b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
deleted file mode 100644
index e5e4e690b60..00000000000
--- a/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A Inheritance class of Batch normalization layer.
- * It supports both CPU and GPU.
- *
- * The config file api is batch_norm_layer.
- */
-
-class BatchNormalizationLayer : public BatchNormBaseLayer {
- public:
-  explicit BatchNormalizationLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config), firstTest_(true) {}
-
-  ~BatchNormalizationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  /// Load pre-calculated mean and std.
-  void setMeanAndStd();
-
-  /// Calculate mean and std.
-  void calMeanAndStd(const MatrixPtr& mat);
-
-  /// Calculate moving mean and variance.
-  void calMovingMeanAndVar();
-
-  /// expand a Matrix from batch, channels* imagePixels to
-  /// batch * ImagePixels * channels.
-  void expandMat(const MatrixPtr& in, MatrixPtr& out);
-
-  /// Shrink a Matrix from  from batch * ImagePixels * channels
-  /// to batch, channels* imagePixels.
-  void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
-
-  void onPassEnd() override { firstTest_ = true; }
-
-  MatrixPtr tmpMat_, tmpGrad_;
-  MatrixPtr expandedIn_, expandedOut_;
-  MatrixPtr expandedInGrad_, expandedOutGrad_, inGrad_;
-  MatrixPtr normIn_, normInGrad_, meanGrad_, stdGrad_;
-
-  /// Load mean and variance only once flag.
-  bool firstTest_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
deleted file mode 100644
index a091f51bc20..00000000000
--- a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BilinearInterpLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(bilinear_interp, BilinearInterpLayer);
-
-size_t BilinearInterpLayer::getSize() {
-  inImgH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  inImgW_ = inputLayers_[0]->getOutput().getFrameWidth();
-
-  const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
-  if (inImgH_ == 0) {
-    inImgH_ = conf.image_conf().img_size_y();
-  }
-  if (inImgW_ == 0) {
-    inImgW_ = conf.image_conf().img_size();
-  }
-
-  outImgH_ = conf.out_size_y();
-  outImgW_ = conf.out_size_x();
-  numChannels_ = conf.image_conf().channels();
-
-  CHECK(outImgH_ > 0 && outImgW_ > 0);
-  CHECK(inImgH_ > 0 && inImgW_ > 0);
-  CHECK(numChannels_);
-
-  ratioH_ =
-      (outImgH_ > 1) ? static_cast<real>(inImgH_ - 1) / (outImgH_ - 1) : 0.f;
-  ratioW_ =
-      (outImgW_ > 1) ? static_cast<real>(inImgW_ - 1) / (outImgW_ - 1) : 0.f;
-
-  getOutput().setFrameHeight(outImgH_);
-  getOutput().setFrameWidth(outImgW_);
-  return outImgH_ * outImgW_ * numChannels_;
-}
-
-bool BilinearInterpLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1, config_.inputs_size());
-
-  return true;
-}
-
-void BilinearInterpLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str());
-    outV->bilinearForward(*inV,
-                          inImgH_,
-                          inImgW_,
-                          outImgH_,
-                          outImgW_,
-                          numChannels_,
-                          ratioH_,
-                          ratioW_);
-  }
-}
-
-void BilinearInterpLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr inputG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-  {
-    REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str());
-    if (inputG) {
-      inputG->bilinearBackward(*outG,
-                               outImgH_,
-                               outImgW_,
-                               inImgH_,
-                               inImgW_,
-                               numChannels_,
-                               ratioH_,
-                               ratioW_);
-    }
-  }
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.h b/paddle/legacy/gserver/layers/BilinearInterpLayer.h
deleted file mode 100644
index c585a5ed10d..00000000000
--- a/paddle/legacy/gserver/layers/BilinearInterpLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for bilinear interpolation which is
- *        used on conv layer output.
- *
- * @note  The config file api is bilinear_interp_layer.
- */
-class BilinearInterpLayer : public Layer {
- protected:
-  size_t outImgH_, outImgW_;
-  size_t inImgH_, inImgW_;
-  real ratioH_, ratioW_;
-  size_t numChannels_;
-
- public:
-  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~BilinearInterpLayer() {}
-
-  size_t getSize();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
deleted file mode 100644
index 24b5af67d40..00000000000
--- a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BlockExpandLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(blockexpand, BlockExpandLayer);
-
-bool BlockExpandLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(config_.inputs_size(), 1);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  blockH_ = blockConf.block_y();
-  blockW_ = blockConf.block_x();
-  strideH_ = blockConf.stride_y();
-  strideW_ = blockConf.stride_x();
-  paddingH_ = blockConf.padding_y();
-  paddingW_ = blockConf.padding_x();
-  channels_ = blockConf.channels();
-  imgSizeH_ = blockConf.img_size_y();
-  imgSizeW_ = blockConf.img_size_x();
-
-  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
-  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
-  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
-  createFunction(forward_,
-                 "BlockExpand",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-  createFunction(backward_,
-                 "BlockExpandGrad",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-
-  return true;
-}
-
-size_t BlockExpandLayer::getBlockNum() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = blockConf.img_size_y();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = blockConf.img_size_x();
-  }
-  size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_;
-  outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_;
-  size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_;
-  outputW_ = (int)tmpW < 0 ? 1 : 1 + (tmpW + strideW_ - 1) / strideW_;
-
-  return outputH_ * outputW_;
-}
-
-void BlockExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  size_t blockNum = getBlockNum();
-  size_t blockSize = blockH_ * blockW_ * channels_;
-  resetOutput(blockNum * batchSize, blockSize);
-
-  // calculate output_.value
-  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inputShape_);
-  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
-  Argument& out = getOutput();
-  ICpuGpuVector::resizeOrCreate(
-      out.sequenceStartPositions, batchSize + 1, false);
-  IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
-  int* start = out.sequenceStartPositions->getMutableData(false);
-  int* dims = out.cpuSequenceDims->getData();
-  for (size_t i = 0; i < batchSize; i++) {
-    start[i] = i * blockNum;
-    dims[2 * i] = outputH_;
-    dims[2 * i + 1] = outputW_;
-  }
-  start[batchSize] = batchSize * blockNum;
-}
-
-void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  /* Calculate the input layers error */
-  if (getInputGrad(0)) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getOutputGrad(), outputShape_);
-    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.h b/paddle/legacy/gserver/layers/BlockExpandLayer.h
deleted file mode 100644
index 8b90249bfb0..00000000000
--- a/paddle/legacy/gserver/layers/BlockExpandLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Expand feature map to minibatch matrix.
- * - matrix width is: blockH_ * blockW_ * channels_
- * - matirx height is: outputH_ * outputW_
- *
- * \f[
- * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
- *             strideH\_ \\
- * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
- *             strideW\_
- * \f]
- *
- * The expand method is the same with ExpandConvLayer, but saved the transposed
- * value. After expanding, output_.sequenceStartPositions will store timeline.
- * The number of time steps are outputH_ * outputW_ and the dimension of each
- * time step is blockH_ * blockW_ * channels_. This layer can be used after
- * convolution neural network, and before recurrent neural network.
- *
- * The config file api is block_expand_layer.
- */
-class BlockExpandLayer : public Layer {
- protected:
-  /**
-   * @brief Calculate outputH_ and outputW_ and return block number which
-   * actually is time steps.
-   * @return time steps, outoutH_ * outputW_.
-   */
-  size_t getBlockNum();
-  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
-  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
-
-  TensorShape inputShape_;
-  TensorShape outputShape_;
-
- public:
-  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BlockExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp b/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
deleted file mode 100644
index 4afed7e2956..00000000000
--- a/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CRFDecodingLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(crf_decoding, CRFDecodingLayer);
-
-bool CRFDecodingLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  if (!CRFLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
-  return true;
-}
-
-void CRFDecodingLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(!useGpu_) << "GPU is not supported";
-
-  const Argument& output = getInput(0);
-  CHECK(output.sequenceStartPositions);
-
-  size_t batchSize = output.getBatchSize();
-  size_t numSequences = output.sequenceStartPositions->getSize() - 1;
-
-  IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
-  const int* starts = output.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], (int)batchSize);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    crf_->decode(output.value->getData() + numClasses_ * starts[i],
-                 output_.ids->getData() + starts[i],
-                 starts[i + 1] - starts[i]);
-  }
-
-  if (inputLayers_.size() == 2) {
-    const Argument& label = getInput(1);
-    resizeOutput(batchSize, 1);
-    CHECK(label.ids);
-    real* error = output_.value->getData();
-    int* ids = label.ids->getData();
-    int* result = output_.ids->getData();
-    for (size_t i = 0; i < batchSize; ++i) {
-      error[i] = ids[i] == result[i] ? 0 : 1;
-    }
-  }
-}
-
-void CRFDecodingLayer::backward(const UpdateCallback& callback) {
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFDecodingLayer.h b/paddle/legacy/gserver/layers/CRFDecodingLayer.h
deleted file mode 100644
index 018162e146f..00000000000
--- a/paddle/legacy/gserver/layers/CRFDecodingLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "CRFLayer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the decoding sequence of sequential conditional
- * random field model.
- * The decoding sequence is stored in output_.ids
- * It also calculate error, output_.value[i] is 1 for incorrect decoding
- * or 0 for correct decoding)
- * See LinearChainCRF.h for the detail of the CRF formulation.
- */
-class CRFDecodingLayer : public CRFLayer {
- public:
-  explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  std::unique_ptr<LinearChainCRF> crf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFLayer.cpp b/paddle/legacy/gserver/layers/CRFLayer.cpp
deleted file mode 100644
index 8b87a533a2b..00000000000
--- a/paddle/legacy/gserver/layers/CRFLayer.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CRFLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(crf, CRFLayer);
-
-bool CRFLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  if (config_.type() == "crf") {
-    CHECK_GE(inputLayers_.size(), 2UL);
-    // the third output is sequence weight. one weight for each sequence
-    CHECK_LE(inputLayers_.size(), 3UL);
-  }
-
-  // coeff only affect bp, keep consistent with CostLayer
-  coeff_ = config_.coeff();
-  if (inputLayers_.size() == 3) {
-    weightLayer_ = inputLayers_[2];
-  }
-
-  numClasses_ = inputLayers_[0]->getSize();
-
-  CHECK_GE(numClasses_, 2UL);
-
-  CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
-
-  parameter_ = parameters_[0];
-  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-
-  return true;
-}
-
-void CRFLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(!useGpu_) << "GPU is not supported";
-
-  const Argument& output = getInput(0);
-  const Argument& label = getInput(1);
-  CHECK(label.sequenceStartPositions);
-  CHECK(label.ids);
-
-  int batchSize = output.getBatchSize();
-  size_t numSequences = label.sequenceStartPositions->getSize() - 1;
-  resizeOutput(numSequences, 1);
-
-  const int* starts = label.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
-    }
-    output_.value->getData()[i] =
-        crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
-                         label.ids->getData() + starts[i],
-                         starts[i + 1] - starts[i]);
-  }
-
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    getOutputValue()->dotMul(*getOutputValue(), *weight);
-  }
-}
-
-void CRFLayer::backward(const UpdateCallback& callback) {
-  const Argument& output = getInput(0);
-  const Argument& label = getInput(1);
-  const int* starts = label.sequenceStartPositions->getData(false);
-  int numSequences = label.sequenceStartPositions->getSize() - 1;
-
-  bool needWGrad = weight_->getWGrad() ? true : false;
-  for (int i = 0; i < numSequences; ++i) {
-    crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i],
-                      needWGrad);
-    real instanceWeight = weightLayer_
-                              ? getInputValue(*weightLayer_)->getElement(i, 0)
-                              : real(1.0f);
-    instanceWeight *= coeff_;
-
-    if (output.grad) {
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
-    }
-    if (needWGrad) {
-      weight_->getWGrad()->add(
-          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
-    }
-  }
-
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFLayer.h b/paddle/legacy/gserver/layers/CRFLayer.h
deleted file mode 100644
index 88c2ed343ad..00000000000
--- a/paddle/legacy/gserver/layers/CRFLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the cost of sequential conditional random field
- * model.
- * See class LinearChainCRF for the detail of the CRF formulation.
- */
-class CRFLayer : public Layer {
- public:
-  explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  size_t numClasses_;
-  ParameterPtr parameter_;
-  std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;            // weight for each sequence
-  std::unique_ptr<Weight> weight_;  // parameters
-  real coeff_;                      // weight for the layer
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CTCLayer.cpp b/paddle/legacy/gserver/layers/CTCLayer.cpp
deleted file mode 100644
index 64eb15cd0dd..00000000000
--- a/paddle/legacy/gserver/layers/CTCLayer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CTCLayer.h"
-
-/* Please reference the Chapter7  in
- * "Alex graves, Supervised Sequence Labelling with
- * Recurrent Neural Networks" */
-namespace paddle {
-REGISTER_LAYER(ctc, CTCLayer);
-
-bool CTCLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL);
-
-  /* The inputLayers_[0] must be softmax output */
-  numClasses_ = inputLayers_[0]->getSize();
-  normByTimes_ = config_.norm_by_times();
-  CHECK_GE(numClasses_, 2UL);
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
-  return true;
-}
-
-void CTCLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(
-          getInput(i), false, HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
-  } else {
-    forwardImp(getInput(0), getInput(1));
-  }
-}
-
-void CTCLayer::forwardImp(const Argument& softmaxSeqs,
-                          const Argument& labelSeqs) {
-  CHECK(softmaxSeqs.sequenceStartPositions);
-  CHECK(labelSeqs.sequenceStartPositions);
-  CHECK(labelSeqs.ids);
-
-  size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1;
-  CHECK_EQ(numSequences, softmaxSeqs.sequenceStartPositions->getSize() - 1);
-
-  resizeOutput(numSequences, 1);
-  std::vector<real> out(numSequences);
-
-  const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false);
-  const int* softmaxSeqsStarts =
-      softmaxSeqs.sequenceStartPositions->getData(false);
-
-  for (size_t i = 0; i < numSequences; i++) {
-    if (i >= ctcs_.size()) {
-      ctcs_.emplace_back(numClasses_, normByTimes_);
-    }
-    out[i] = ctcs_[i].forward(
-        softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i],
-        softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i],
-        labelSeqs.ids->getData() + labelSeqsStarts[i],
-        labelSeqsStarts[i + 1] - labelSeqsStarts[i]);
-  }
-  output_.value->copyFrom(out.data(), numSequences);
-}
-
-void CTCLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (useGpu_) {
-    backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
-    const_cast<Argument&>(getInput(0))
-        .resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
-    const_cast<Argument&>(getInput(1))
-        .resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
-  } else {
-    backwardImp(callback, getInput(0), getInput(1));
-  }
-}
-
-void CTCLayer::backwardImp(const UpdateCallback& callback,
-                           const Argument& softmaxSeqs,
-                           const Argument& labelSeqs) {
-  size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1;
-
-  const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false);
-  const int* softmaxSeqsStarts =
-      softmaxSeqs.sequenceStartPositions->getData(false);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    ctcs_[i].backward(
-        softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i],
-        softmaxSeqs.grad->getData() + numClasses_ * softmaxSeqsStarts[i],
-        labelSeqs.ids->getData() + labelSeqsStarts[i],
-        labelSeqsStarts[i + 1] - labelSeqsStarts[i]);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CTCLayer.h b/paddle/legacy/gserver/layers/CTCLayer.h
deleted file mode 100644
index 5d70b1f4ceb..00000000000
--- a/paddle/legacy/gserver/layers/CTCLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LinearChainCTC.h"
-
-namespace paddle {
-
-class CTCLayer : public Layer {
- public:
-  explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  void backward(const UpdateCallback& callback) override;
-  void backwardImp(const UpdateCallback& callback,
-                   const Argument& softmaxSeqs,
-                   const Argument& labelSeqs);
-
- protected:
-  size_t numClasses_;
-  bool normByTimes_;
-  std::vector<LinearChainCTC> ctcs_;
-  std::vector<Argument> tmpCpuInput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ClipLayer.cpp b/paddle/legacy/gserver/layers/ClipLayer.cpp
deleted file mode 100644
index 6aa3c8fe64f..00000000000
--- a/paddle/legacy/gserver/layers/ClipLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for clipping the input value by the threshold.
- * \f[
- *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
- * \f]
- */
-
-class ClipLayer : public Layer {
- protected:
-  double min_;
-  double max_;
-
- public:
-  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(clip, ClipLayer);
-
-bool ClipLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-  auto layerConf = config_.inputs(0).clip_conf();
-  min_ = layerConf.min();
-  max_ = layerConf.max();
-  CHECK_LT(min_, max_);
-  return true;
-}
-
-void ClipLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(*inV);
-  outV->clip(min_, max_);
-}
-
-void ClipLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  if (inG) {
-    MatrixPtr outV = getOutputValue();
-    MatrixPtr outG = getOutputGrad();
-    MatrixPtr tmpMtx;
-    Matrix::resizeOrCreate(
-        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
-    tmpMtx->clipDerivative(*inV, min_, max_);
-    inG->addDotMul(*outG, *tmpMtx, 1, 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
deleted file mode 100644
index ce3f2ca950b..00000000000
--- a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A concatenate layer has multiple input layers. It concatenates rows of
- * each input as one row for the output of this layer and apply activation.
- */
-class ConcatenateLayer : public Layer {
- public:
-  explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(concat, ConcatenateLayer);
-
-bool ConcatenateLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK(!biasParameter_);
-
-  return true;
-}
-
-void ConcatenateLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr& out = getOutputValue();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputValue(i);
-    size_t inSize = in->getWidth();
-    out->assignAtOffset(*in, offset);
-    offset += inSize;
-  }
-  CHECK_EQ(size, offset);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const MatrixPtr& out = getOutputGrad();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputGrad(i);
-    size_t inSize = getInputValue(i)->getWidth();
-    if (in) {
-      in->addAtOffset(*out, offset);
-    }
-    offset += inSize;
-  }
-}
-
-/**
- * concat2 layer is like concat layer, but each input layer was
- * processed by a Projection.
- */
-class ConcatenateLayer2 : public Layer {
- public:
-  explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer2() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-  bool sharedBias_;
-  std::unique_ptr<Weight> biases_;
-};
-
-REGISTER_LAYER(concat2, ConcatenateLayer2);
-
-bool ConcatenateLayer2::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projCol_.reserve(inputLayers_.size());
-  projOutput_.resize(inputLayers_.size());
-
-  size_t startCol = 0;
-  size_t endCol = 0;
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    projections_.emplace_back(Projection::create(
-        config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-
-    endCol += projections_[i]->getOutputSize();
-    projCol_.push_back(std::make_pair(startCol, endCol));
-    startCol = endCol;
-  }
-  CHECK_EQ(getSize(), endCol);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void ConcatenateLayer2::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  for (size_t i = 0; i < projections_.size(); i++) {
-    size_t startCol = projCol_[i].first;
-    size_t endCol = projCol_[i].second;
-    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
-    if (output_.grad) {
-      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
-    }
-  }
-
-  {
-    AsyncGpuBlock block;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
-    }
-  }
-
-  /* add the bias-vector */
-  if (biases_) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer2::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  AsyncGpuBlock block;
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.cpp b/paddle/legacy/gserver/layers/ContextProjection.cpp
deleted file mode 100644
index 8bcf32663eb..00000000000
--- a/paddle/legacy/gserver/layers/ContextProjection.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(context, ContextProjection);
-
-ContextProjection::ContextProjection(const ProjectionConfig& config,
-                                     ParameterPtr parameter,
-                                     bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(config.has_context_start());
-  CHECK(config.has_context_length());
-  if (config.context_start() == 0 && config.context_length() == 1) {
-    config_.set_trainable_padding(false);
-  }
-  if (config_.trainable_padding()) {
-    CHECK(parameter);
-    beginPad_ = std::max(0, -config.context_start());
-    endPad_ = std::max(0, config.context_start() + config.context_length() - 1);
-    size_t totalPad = beginPad_ + endPad_;
-    size_t inputDim = parameter->getSize() / totalPad;
-    CHECK_EQ(config.input_size(), inputDim);
-    CHECK_EQ(inputDim * totalPad, parameter->getSize());
-    weight_.reset(new Weight(totalPad, inputDim, parameter));
-  }
-  // init forward_ and backward_ functions
-  init();
-}
-
-bool ContextProjection::init() {
-  size_t context_length = config_.context_length();
-  int context_start = config_.context_start();
-  bool is_padding = config_.trainable_padding();
-  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
-
-  createFunction(forward_,
-                 "ContextProjectionForward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_));
-  createFunction(backward_,
-                 "ContextProjectionBackward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_)
-                     .set("is_padding", is_padding)
-                     .set("total_pad", total_pad));
-
-  return true;
-}
-
-void ContextProjection::resetState() {
-  CHECK_LE(config_.context_start() + config_.context_length(), 1)
-      << "state is not allowed for future context";
-  if (config_.context_start() >= 0) return;
-  Matrix::resizeOrCreate(state_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  Matrix::resizeOrCreate(state2_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  if (config_.trainable_padding()) {
-    state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start()));
-  } else {
-    state_->zeroMem();
-  }
-}
-
-void ContextProjection::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for ContextProjection state";
-  state_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr ContextProjection::getState() {
-  if (state_ == nullptr) {
-    return nullptr;
-  }
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(state_->clone(0, 0, false));
-  res->value[0]->copyFrom(*state_);
-  return res;
-}
-
-void ContextProjection::forward() {
-  CHECK(in_->value && out_->value);
-  CHECK(in_->sequenceStartPositions);
-
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  // size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  /// first use state_, otherwise use weight_(padding false === w nullptr)
-  auto w_ptr =
-      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*in_->value, *start_pos);
-  if (w_ptr) {
-    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
-                  *start_pos);
-  }
-  outputs.addArg(*out_->value, *start_pos, ADD_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  if (state_ && config_.context_start() < 0) {
-    CHECK_EQ(1, in_->getNumSequences());
-    const int* starts = in_->sequenceStartPositions->getData(false);
-    int length = starts[1] - starts[0];
-    if (-config_.context_start() <= length) {
-      MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(),
-                                            -config_.context_start());
-      state_->copyFrom(*sub);
-    } else {
-      int prevLength = -config_.context_start() - length;
-      state2_->subMatrix(0, prevLength)
-          ->copyFrom(*state_->subMatrix(length, prevLength));
-      state2_->subMatrix(prevLength, length)
-          ->copyFrom(*in_->value->subMatrix(starts[0], length));
-      std::swap(state_, state2_);
-    }
-  }
-}
-
-void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value && out_->value && out_->grad);
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(batch_size, out_->value->getHeight());
-  CHECK_EQ(static_cast<int>(backward_.size()), 1)
-      << "Only one backward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  auto start_pos = in_->sequenceStartPositions;
-  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(
-      CpuMatrix(
-          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
-      *in_->sequenceStartPositions->getVector(useGpu_),
-      ADD_TO);
-  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                           w_ptr ? w_ptr->getHeight() : 0,
-                           input_dim),
-                 ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-
-  if (config_.trainable_padding()) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.h b/paddle/legacy/gserver/layers/ContextProjection.h
deleted file mode 100644
index 9c217145419..00000000000
--- a/paddle/legacy/gserver/layers/ContextProjection.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * @brief Context projection concatenate features in adjacent time steps in
- * a sequence. The i-th row of the output is the concatenation of
- * context_length rows of the input. The context_length rows are the
- * consecutive rows from the i+shift_start row.
- *
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * The config file api is context_projection.
- */
-class ContextProjection : public Projection {
- public:
-  /**
-   * Constructor. If context_start is zero and context_lenth is one, it will
-   * set trainable_padding false. trainable_padding is an optional arguments
-   * and if it is set, constructor will set learned weight, which is used to
-   * pad output.
-   */
-  ContextProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-  virtual void resetState();
-
-  virtual void setState(LayerStatePtr state);
-
-  virtual LayerStatePtr getState();
-
-  virtual bool init();
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  /// number of extra timesteps added at the beginning
-  size_t beginPad_;
-  /// number of extra timesteps added at the end
-  size_t endPad_;
-  /// state_ and state2_ are used in sequence generating and saved
-  /// previous inputs.
-  MatrixPtr state_;
-  MatrixPtr state2_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.cpp b/paddle/legacy/gserver/layers/Conv3DLayer.cpp
deleted file mode 100644
index d072a74234b..00000000000
--- a/paddle/legacy/gserver/layers/Conv3DLayer.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Conv3DLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(conv3d, Conv3DLayer);
-
-bool Conv3DLayer::init(const LayerMap &layerMap,
-                       const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    M_.push_back(numFilters_ / conf.groups());
-    K_.push_back(filterPixels_[index] * filterChannels_[index]);
-
-    // create a new weight
-    size_t height, width;
-    width = filterPixels_[index] * filterChannels_[index];
-    height = numFilters_;
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    ++index;
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t Conv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  outputH_.clear();
-  outputW_.clear();
-  outputD_.clear();
-  N_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    outputW_.push_back(outputSize(
-        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    outputH_.push_back(outputSize(
-        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    outputD_.push_back(outputSize(
-        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += N_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(outputH_[0]);
-  getOutput().setFrameWidth(outputW_[0]);
-  getOutput().setFrameDepth(outputD_[0]);
-  return layerSize;
-}
-
-void Conv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-
-  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    const MatrixPtr &outMat = getOutputValue();
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    MatrixPtr wMat = weights_[i]->getW();
-    for (int n = 0; n < batchSize; ++n) {
-      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                       channels_[i],
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i]);
-
-      real *outData = outMat->getData() + n * outMat->getStride();
-      MatrixPtr outMatSub =
-          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
-      for (int g = 0; g < groups_[i]; g++) {
-        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-        MatrixPtr in = colBuf_->subMatrix(g * K, K);
-        MatrixPtr out = outMatSub->subMatrix(g * M, M);
-        out->mul(*wMatSub, *in, 1.0, 1.0);
-      }
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void Conv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad()) {
-      bpropWeights(i);
-    }
-    if (getInputGrad(i)) {
-      bpropData(i);
-    }
-    weights_[i]->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void Conv3DLayer::bpropWeights(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  const MatrixPtr &inMat = getInputValue(i);
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wGradMat = weights_[i]->getWGrad();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i]);
-
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
-      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
-    }
-  }
-}
-
-void Conv3DLayer::bpropData(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wMat = weights_[i]->getW();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    real *preGradData =
-        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
-      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
-    }
-    colBuf_->col2Vol(preGradData,
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i],
-                     1.0,
-                     1.0);
-  }
-}
-
-void Conv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void Conv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.h b/paddle/legacy/gserver/layers/Conv3DLayer.h
deleted file mode 100644
index cb42a2f36d3..00000000000
--- a/paddle/legacy/gserver/layers/Conv3DLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- */
-class Conv3DLayer : public ConvBaseLayer {
- public:
-  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~Conv3DLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
- protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
deleted file mode 100644
index 76120915e48..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-bool ConvBaseLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv")
-                  ? false
-                  : true;
-
-  /* Initialize the convolutional layer parameter */
-  numFilters_ = config_.num_filters();
-  sharedBiases_ = config_.shared_biases();
-  for (auto& inputConfig : config_.inputs()) {
-    const ConvConfig& conf = inputConfig.conv_conf();
-    padding_.push_back(conf.padding());
-    stride_.push_back(conf.stride());
-    dilation_.push_back(conf.dilation());
-    filterSize_.push_back(conf.filter_size());
-    paddingY_.push_back(conf.padding_y());
-    strideY_.push_back(conf.stride_y());
-    dilationY_.push_back(conf.dilation_y());
-    filterSizeY_.push_back(conf.filter_size_y());
-    channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
-                                              : conf.img_size());
-    imgSizeW_.push_back(conf.img_size());
-    groups_.push_back(conf.groups());
-    filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
-    outputW_.push_back(conf.output_x());
-
-    paddingZ_.push_back(conf.padding_z());
-    strideZ_.push_back(conf.stride_z());
-    filterSizeZ_.push_back(conf.filter_size_z());
-    imgSizeD_.push_back(conf.img_size_z());
-    outputD_.push_back(conf.output_z());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
-                            filterSizeZ_.back());
-  }
-
-  CHECK(inputLayers_.size() == parameters_.size());
-
-  // create new weights_ in derived class
-  // create new biases_ in derived class
-
-  // default caffe model
-  caffeMode_ = true;
-
-  return true;
-}
-
-size_t ConvBaseLayer::calOutputSize() {
-  auto clearAndReserve = [this](IntV* vec) {
-    vec->clear();
-    vec->reserve(this->inputLayers_.size());
-  };
-  clearAndReserve(&imgSizeH_);
-  clearAndReserve(&imgSizeW_);
-  clearAndReserve(&outputH_);
-  clearAndReserve(&outputW_);
-  size_t layerSize = 0;
-
-  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
-    size_t filterSizeY;
-    size_t filterSize;
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
-      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
-      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
-      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
-      const ConvConfig& conf = config_.inputs(i).conv_conf();
-      if (isDeconv_) {
-        if (inH[i] == 0)
-          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
-        if (inW[i] == 0) inW[i] = conf.output_x();
-        outH.push_back(imageSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(
-            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      } else {
-        if (inH[i] == 0)
-          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-        if (inW[i] == 0) inW[i] = conf.img_size();
-        outH.push_back(outputSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(outputSize(
-            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      }
-      CHECK_EQ(outH[i], outH[0]);
-      CHECK_EQ(outW[i], outW[0]);
-    }
-    getOutput().setFrameHeight(outH[0]);
-    getOutput().setFrameWidth(outW[0]);
-    layerSize = outH[0] * outW[0] * size_t(numFilters_);
-  };
-
-  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-
-  return layerSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.h b/paddle/legacy/gserver/layers/ConvBaseLayer.h
deleted file mode 100644
index 01e90e99962..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-namespace paddle {
-
-/**
- * @brief A Base Convolution Layer, which convolves the input image
- * with learned filters and (optionally) adds biases.
- */
-
-class ConvBaseLayer : public Layer {
- protected:
-  typedef std::vector<int> IntV;
-
-  /// True if it's deconv layer, false if it's convolution layer
-  bool isDeconv_;
-
-  /// The number of filters.
-  int numFilters_;
-  /// The x dimension of the padding.
-  IntV padding_;
-  /// The y dimension of the padding.
-  IntV paddingY_;
-  /// The x dimension of the stride.
-  IntV stride_;
-  /// The y dimension of the stride.
-  IntV strideY_;
-  /// The x dimension of the dilation.
-  IntV dilation_;
-  /// The y dimension of the dilation.
-  IntV dilationY_;
-  /// The x dimension of a filter kernel.
-  IntV filterSize_;
-  /// The y dimension of a filter kernel.
-  IntV filterSizeY_;
-  /// The spatial dimensions of the convolution input.
-  IntV channels_;
-  /// The spatial dimensions of input feature map height.
-  IntV imgSizeH_;
-  /// The spatial dimensions of input feature map width.
-  IntV imgSizeW_;
-  /// filterPixels_ = filterSizeX_ * filterSizeY_.
-  IntV filterPixels_;
-  /// filterChannels_ = channels_/groups_.
-  IntV filterChannels_;
-  /// The spatial dimensions of output feature map height.
-  IntV outputH_;
-  /// The spatial dimensions of output feature map width.
-  IntV outputW_;
-
-  IntV outputD_;
-  IntV imgSizeD_;
-  IntV filterSizeZ_;
-  IntV strideZ_;
-  IntV paddingZ_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  IntV groups_;
-  /// Whether the bias is shared for feature in each channel.
-  bool sharedBiases_;
-
-  /// shape of weight: (numChannels * filterPixels_, numFilters)
-  WeightList weights_;
-  /// If shared_biases is false shape of bias: (numFilters_, 1)
-  /// If shared_biases is ture shape of bias:
-  /// (numFilters_ * outputX * outputY, 1)
-  std::unique_ptr<Weight> biases_;
-
-  /// True by default. The only difference is the calculation
-  /// of output size.
-  bool caffeMode_;
-
- public:
-  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
-   * in this function. Then it will calculate outputH_ and outputW_ and set them
-   * into output argument.
-   */
-  virtual size_t calOutputSize();
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
deleted file mode 100644
index e8e59b3bfe9..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvBaseOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK(useGpu);
-  CHECK_EQ(config_.input_indices_size(), 2L);
-
-  caffeMode_ = true;
-  getConvParams();
-  computeConvSizes();
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  workSpace_ = nullptr;
-
-  isSelectAlgo_ = false;
-}
-
-void ConvBaseOperator::allocConvWorkSpace() {
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    /*useDilation*/ false);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-
-  if (maxWorkSpace > workSpaceInBytes_) {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-    }
-    // total amount of storage needed
-    workSpace_ = hl_malloc_device(maxWorkSpace);
-    workSpaceInBytes_ = maxWorkSpace;
-  }
-}
-
-void ConvBaseOperator::computeConvSizes() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingY_,
-                                   padding_,
-                                   strideY_,
-                                   stride_);
-}
-
-void ConvBaseOperator::reshapeImageDescriptors() {
-  hl_tensor_reshape(imageDesc_,
-                    1,
-                    channels_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_tensor_reshape(outputDesc_,
-                    1,
-                    numFilters_,
-                    outputH_,
-                    outputW_,
-                    numFilters_ * outputH_ * outputW_,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingY_,
-                                  padding_,
-                                  strideY_,
-                                  stride_);
-}
-
-void ConvBaseOperator::getConvParams() {
-  configNumFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  configChannels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-  if (isDeconv_) {
-    channels_ = configNumFilters_;
-    numFilters_ = configChannels_;
-  } else {
-    channels_ = configChannels_;
-    numFilters_ = configNumFilters_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.h b/paddle/legacy/gserver/layers/ConvBaseOperator.h
deleted file mode 100644
index 4ac77f2d743..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseOperator.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "Operator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvBaseOperator : public Operator {
- public:
-  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvBaseOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-
-    hl_destroy_tensor_descriptor(imageDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-
- protected:
-  /**
-   * Get convolution parameters from layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-
-  /**
-   * Allocate Gpu Memory for cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace();
-
-  /**
-   * Create cudnn tensor descriptor for convolution operation.
-   */
-  void computeConvSizes();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshapeImageDescriptors();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  virtual void reshape(int batchSize) = 0;
-
-  /**
-   * Check filter size is equal to the size calculated by parameters from
-   * layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-
-  /// Most of member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  bool isDeconv_;
-  int imageH_, imageW_, outputH_, outputW_;
-  hl_tensor_descriptor imageDesc_;
-  hl_tensor_descriptor outputDesc_;
-  hl_filter_descriptor filterDesc_;
-  hl_convolution_descriptor convDesc_;
-  bool caffeMode_;
-  int inputOffset_, outputOffset_, weightOffset_;
-  int numFilters_, channels_;
-
-  /// from parsing config
-  int configNumFilters_, configChannels_;
-  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
-  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
-
-  /// Following member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
-  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
-  size_t workSpaceInBytes_;
-  void *workSpace_;
-  bool isSelectAlgo_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
deleted file mode 100644
index ff5d3412de1..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
-
-ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
-                                       ParameterPtr parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(useGpu);  // only support GPU
-  getConvParams();
-  initCudnn();
-
-  size_t height = filterH_ * filterW_ * channels_ / groups_;
-  size_t width = numFilters_;
-  weight_.reset(new Weight(height, width, parameter));
-  weightOffset_ = height * width / groups_;
-}
-
-void ConvBaseProjection::getConvParams() {
-  const ConvConfig &conf = config_.conv_conf();
-  paddingH_ = conf.padding_y();
-  paddingW_ = conf.padding();
-
-  strideH_ = conf.stride_y();
-  strideW_ = conf.stride();
-
-  dilationH_ = conf.dilation_y();
-  dilationW_ = conf.dilation();
-  CHECK_GT(dilationH_, 0);
-  CHECK_GT(dilationW_, 0);
-
-  filterH_ = conf.filter_size_y();
-  filterW_ = conf.filter_size();
-
-  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  configImgW_ = conf.img_size();
-
-  configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  configOutW_ = conf.output_x();
-
-  configChannels_ = conf.channels();
-  configNumFilters_ = config_.num_filters();
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-
-  channels_ = (isDeconv_) ? configNumFilters_ : configChannels_;
-  numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_;
-
-  groups_ = conf.groups();
-  CHECK_EQ(channels_ % groups_, 0);
-  CHECK_EQ(numFilters_ % groups_, 0);
-}
-
-void ConvBaseProjection::initCudnn() {
-  hl_create_filter_descriptor(&filterDesc_,
-                              channels_ / groups_,
-                              numFilters_ / groups_,
-                              filterH_,
-                              filterW_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingH_,
-                                   paddingW_,
-                                   strideH_,
-                                   strideW_,
-                                   dilationH_,
-                                   dilationW_);
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-}
-
-void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
-  // The stride between two consecutive samples in the output of ConvProjection
-  // may not be numFilters_ * outputH_ * outputW_ (conv) or
-  // channels_ * imageH_ * imageW_ (deconv)
-  // for example, in the case of layer ConcatenateLayer2 with two
-  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
-  // So the calculation of nStride is different from CudnnConvLayer.
-  size_t nStrideImage, nStrideOutput;
-  if (isDeconv_) {
-    nStrideImage = out_->value->getStride();
-    nStrideOutput = numFilters_ * outputH_ * outputW_;
-  } else {
-    nStrideImage = channels_ * imageH_ * imageW_;
-    nStrideOutput = out_->value->getStride();
-  }
-
-  hl_tensor_reshape(imageDesc_,
-                    batchSize,
-                    channels_ / groups_,
-                    imageH_,
-                    imageW_,
-                    nStrideImage,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-
-  hl_tensor_reshape(outputDesc_,
-                    batchSize,
-                    numFilters_ / groups_,
-                    outputH_,
-                    outputW_,
-                    nStrideOutput,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingH_,
-                                  paddingW_,
-                                  strideH_,
-                                  strideW_,
-                                  dilationH_,
-                                  dilationW_);
-}
-
-void ConvBaseProjection::reshape(int batchSize) {
-  size_t width = calOutputSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(calInputSize(), in_->value->getWidth());
-
-  reshapeTensorDesc(batchSize);
-  bool useDilation = false;
-  if (dilationH_ > 1 || dilationW_ > 1) {
-    useDilation = true;
-  }
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    useDilation);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-  workSpaceInBytes_ = maxWorkSpace;
-
-  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-}
-
-void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandlePtr> &convMem = *convMem_;
-  if (convMem.empty()) {
-    int numDevices = hl_get_device_count();
-    convMem.resize(numDevices);
-  }
-
-  int devId = hl_get_device();
-  MemoryHandlePtr localMem = convMem[devId];
-  if (NULL == localMem || size > localMem->getAllocSize()) {
-    localMem = std::make_shared<GpuMemoryHandle>(size);
-  }
-  return localMem->getBuf();
-}
-
-ConvBaseProjection::~ConvBaseProjection() {
-  hl_destroy_tensor_descriptor(imageDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_filter_descriptor(filterDesc_);
-  hl_destroy_convolution_descriptor(convDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.h b/paddle/legacy/gserver/layers/ConvBaseProjection.h
deleted file mode 100644
index dcf5ce0f48d..00000000000
--- a/paddle/legacy/gserver/layers/ConvBaseProjection.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Base class for ConvProjection and ConvTransProjection.
- */
-class ConvBaseProjection : public Projection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvBaseProjection(const ProjectionConfig& config,
-                     ParameterPtr parameter,
-                     bool useGpu);
-
-  ~ConvBaseProjection();
-
- protected:
-  void getConvParams();
-  void initCudnn();
-
-  void reshapeTensorDesc(int batchSize);
-  void reshape(int batchSize);
-
-  virtual size_t calOutputSize() = 0;
-  virtual size_t calInputSize() = 0;
-
-  static void* getSpaceBytes(size_t size);
-
-  /// True if it's deconv projection layer, false if it's ConvProjection layer
-  bool isDeconv_;
-  /// imageH_ and imageW_ / outputH_ and outputW_
-  /// is calculated from the input layer.
-  int imageH_, imageW_;
-  int outputH_, outputW_;
-  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
-  /// is obtained from config.
-  int configImgH_, configImgW_;
-  int configOutH_, configOutW_;
-  /// channels_ and numFilters_ are defined in terms of convolution semantics
-  int channels_, numFilters_;
-  /// configChannels and configNumFilters_ are obtained from config
-  /// For Conv they are the same as channels_ and numFilters
-  /// For ConvTrans they are opposite to channels_ and numFilters
-  int configChannels_, configNumFilters_;
-  int paddingH_, paddingW_;
-  int strideH_, strideW_;
-  int dilationH_, dilationW_;
-  int filterH_, filterW_;
-  /// One group offset of input data.
-  int inputOffset_;
-  /// One group offset of output data.
-  int outputOffset_;
-  /// One group offset of weight.
-  int weightOffset_;
-  int groups_;
-
-  /// Cudnn tensor descriptor for input.
-  hl_tensor_descriptor imageDesc_;
-  /// Cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// Cudnn tensor descriptor for filter.
-  hl_filter_descriptor filterDesc_;
-  /// Cudnn tensor descriptor for a convolution operation.
-  hl_convolution_descriptor convDesc_;
-
-  /// Record the algorithm for forward convolution, which is obtained by cudnn
-  /// api to search the best suited algorithm.
-  int fwdAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// filter coefficients.
-  int bwdFilterAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// the output.
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-  bool bias_;
-
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.cpp b/paddle/legacy/gserver/layers/ConvOperator.cpp
deleted file mode 100644
index 5276b2c3920..00000000000
--- a/paddle/legacy/gserver/layers/ConvOperator.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(conv, ConvOperator);
-
-void ConvOperator::reshape(int batchSize) {
-  imageH_ = ins_[0]->getFrameHeight();
-  imageW_ = ins_[0]->getFrameWidth();
-  if (imageH_ == 0) imageH_ = imgSizeY_;
-  if (imageW_ == 0) imageW_ = imgSize_;
-  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the outputSizes are consistent with config
-  CHECK_EQ(outputH_, outputY_);
-  CHECK_EQ(outputW_, outputX_);
-  out_->setFrameHeight(outputH_);
-  out_->setFrameWidth(outputW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = channels_ * imageH_ * imageW_;
-  outputOffset_ = numFilters_ * outputH_ * outputW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(out_->value,
-                         batchSize,
-                         outputH_ * outputW_ * numFilters_,
-                         false,
-                         useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_forward(imageDesc_,
-                             inputData,
-                             outputDesc_,
-                             outData,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace_,
-                             workSpaceInBytes_,
-                             fwdAlgo_);
-    }
-  }
-}
-
-void ConvOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       inputData,
-                                       outputDesc_,
-                                       outGrad,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_data(imageDesc_,
-                                     inputGrad,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     wgtData,
-                                     convDesc_,
-                                     workSpace_,
-                                     workSpaceInBytes_,
-                                     bwdDataAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.h b/paddle/legacy/gserver/layers/ConvOperator.h
deleted file mode 100644
index 8f31620111c..00000000000
--- a/paddle/legacy/gserver/layers/ConvOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvOperator : public ConvBaseOperator {
- public:
-  ConvOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.cpp b/paddle/legacy/gserver/layers/ConvProjection.cpp
deleted file mode 100644
index b40cdac2587..00000000000
--- a/paddle/legacy/gserver/layers/ConvProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(conv, ConvProjection);
-
-size_t ConvProjection::calOutputSize() {
-  imageH_ = in_->getFrameHeight();
-  imageW_ = in_->getFrameWidth();
-  if (imageH_ == 0) imageH_ = configImgH_;
-  if (imageW_ == 0) imageW_ = configImgW_;
-  outputH_ = outputSize(imageH_,
-                        (filterH_ - 1) * dilationH_ + 1,
-                        paddingH_,
-                        strideH_,
-                        /* caffeMode */ true);
-  outputW_ = outputSize(imageW_,
-                        (filterW_ - 1) * dilationW_ + 1,
-                        paddingW_,
-                        strideW_,
-                        /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(outputH_);
-  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
-
-  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;
-  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;
-  return outputH_ * outputW_ * configNumFilters_;
-}
-
-size_t ConvProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
-}
-
-void ConvProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
-
-    real *inputData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_forward(imageDesc_,
-                           inputData,
-                           outputDesc_,
-                           outData,
-                           filterDesc_,
-                           wgtData,
-                           convDesc_,
-                           workSpace,
-                           fwdLimitBytes_,
-                           fwdAlgo_);
-  }
-}
-
-void ConvProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inputData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     inputData,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inputGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_backward_data(imageDesc_,
-                                   inputGrad,
-                                   outputDesc_,
-                                   outGrad,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace,
-                                   bwdDataLimitBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.h b/paddle/legacy/gserver/layers/ConvProjection.h
deleted file mode 100644
index 890a17e2f8d..00000000000
--- a/paddle/legacy/gserver/layers/ConvProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvProjection : public ConvBaseProjection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
deleted file mode 100644
index b7ecbe556c5..00000000000
--- a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for circular convluation of two vectors,
- * which is used in NEURAL TURING MACHINE.
- * - Input: two vectors, the first is data (batchSize x dataDim)
- * the second is shift weights (batchSize x shiftDim)
- * - Output: a vector (batchSize x dataDim)
- * Assumed that:
- * - a[in]: contains M elements.
- * - b[in]: contains N elements (N should be odd).
- * - c[out]: contains M elements.
- *
- * \f[
- *     c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
- * \f]
- *
- * In this formula:
- *  - a's index is computed modulo M.
- *  - b's index is comupted modulo N.
- *
- * The config file api is conv_shift_layer.
- */
-
-class ConvShiftLayer : public Layer {
- public:
-  explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvShiftLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(conv_shift, ConvShiftLayer);
-
-bool ConvShiftLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ConvShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dataDim = inV0->getWidth();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(dataDim, getSize());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
-  outV->circularConv(*inV0, *inV1);
-}
-
-void ConvShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
-
-  if (inG0 && inG1) {
-    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
deleted file mode 100644
index f4ce2affb14..00000000000
--- a/paddle/legacy/gserver/layers/ConvTransOperator.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(convt, ConvTransOperator);
-
-void ConvTransOperator::reshape(int batchSize) {
-  outputH_ = ins_[0]->getFrameHeight();
-  outputW_ = ins_[0]->getFrameWidth();
-  if (outputH_ == 0) outputH_ = outputY_;
-  if (outputW_ == 0) outputW_ = outputX_;
-  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the imageSizes are consistent with config
-  CHECK_EQ(imageH_, imgSizeY_);
-  CHECK_EQ(imageW_, imgSize_);
-  out_->setFrameHeight(imageH_);
-  out_->setFrameWidth(imageW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = numFilters_ * outputH_ * outputW_;
-  outputOffset_ = channels_ * imageH_ * imageW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvTransOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(
-      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_backward_data(imageDesc_,
-                                   outData,
-                                   outputDesc_,
-                                   inputData,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace_,
-                                   workSpaceInBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-}
-
-void ConvTransOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       outGrad,
-                                       outputDesc_,
-                                       inputData,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_forward(imageDesc_,
-                               outGrad,
-                               outputDesc_,
-                               inputGrad,
-                               filterDesc_,
-                               wgtData,
-                               convDesc_,
-                               workSpace_,
-                               workSpaceInBytes_,
-                               fwdAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.h b/paddle/legacy/gserver/layers/ConvTransOperator.h
deleted file mode 100644
index 206335a01ff..00000000000
--- a/paddle/legacy/gserver/layers/ConvTransOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvTransOperator : public ConvBaseOperator {
- public:
-  ConvTransOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvTransOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.cpp b/paddle/legacy/gserver/layers/ConvTransProjection.cpp
deleted file mode 100644
index 00e34c8f2dc..00000000000
--- a/paddle/legacy/gserver/layers/ConvTransProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(convt, ConvTransProjection);
-size_t ConvTransProjection::calOutputSize() {
-  outputH_ = in_->getFrameHeight();
-  outputW_ = in_->getFrameWidth();
-  if (outputH_ == 0) outputH_ = configOutH_;
-  if (outputW_ == 0) outputW_ = configOutW_;
-  imageH_ = imageSize(outputH_,
-                      (filterH_ - 1) * dilationH_ + 1,
-                      paddingH_,
-                      strideH_,
-                      /* caffeMode */ true);
-
-  imageW_ = imageSize(outputW_,
-                      (filterW_ - 1) * dilationW_ + 1,
-                      paddingW_,
-                      strideW_,
-                      /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(imageH_);
-  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
-
-  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;
-  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;
-  return imageH_ * imageW_ * configNumFilters_;
-}
-
-size_t ConvTransProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
-}
-
-void ConvTransProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
-
-    real *inData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_backward_data(imageDesc_,
-                                 outData,
-                                 outputDesc_,
-                                 inData,
-                                 filterDesc_,
-                                 wgtData,
-                                 convDesc_,
-                                 workSpace,
-                                 bwdDataLimitBytes_,
-                                 bwdDataAlgo_);
-  }
-}
-
-void ConvTransProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     outGrad,
-                                     outputDesc_,
-                                     inData,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_forward(imageDesc_,
-                             outGrad,
-                             outputDesc_,
-                             inGrad,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace,
-                             fwdLimitBytes_,
-                             fwdAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.h b/paddle/legacy/gserver/layers/ConvTransProjection.h
deleted file mode 100644
index 9b63dd47352..00000000000
--- a/paddle/legacy/gserver/layers/ConvTransProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvTransProjection : public ConvBaseProjection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvTransProjection(const ProjectionConfig& config,
-                      ParameterPtr parameter,
-                      bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvTransProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
deleted file mode 100644
index c38ab251f18..00000000000
--- a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for weighted sum of vectors,
- * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
- * TRANSLATE
- * - Input: the the size of the first input is weightDim,
- *          and the size of the second input is weightdim * dataDim.
- * - Output: the sizeof the output is dataDim
- * \f[
- *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
- *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
- * \f]
- * Note that the above computation is for one sample. Multiple samples are
- * processed in one batch.
- *
- * The config file api is linear_comb_layer.
- */
-class ConvexCombinationLayer : public Layer {
- protected:
-  /// A matrix pointer pointing to second input.
-  MatrixPtr tmpMtx0;
-  /// A matrix pointer pointing to first input.
-  MatrixPtr tmpRow0;
-  /// A matrix pointer pointing to output.
-  MatrixPtr tmpRow1;
-
- public:
-  explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvexCombinationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
-
-bool ConvexCombinationLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(2U, inputLayers_.size());
-  size_t dataDim = getSize();
-  size_t weightDim = inputLayers_[0]->getSize();
-
-  CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize())
-      << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           weightDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ weightDim,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  return true;
-}
-
-void ConvexCombinationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-    tmpRow0->setData(inV0->getData() + i * weightDim);
-    tmpRow1->setData(outV->getData() + i * dataDim);
-
-    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
-  }
-}
-
-void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str());
-
-  if (inG0) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inG0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-
-      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
-    }
-  }
-
-  if (inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
-
-      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.cpp b/paddle/legacy/gserver/layers/CosSimLayer.cpp
deleted file mode 100644
index ab8d7cc1f61..00000000000
--- a/paddle/legacy/gserver/layers/CosSimLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cos, CosSimLayer);
-
-bool CosSimLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2LU);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  {
-    REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
-    MatrixPtr prevOut1 = getInputValue(0);
-    MatrixPtr prevOut2 = getInputValue(1);
-
-    CHECK(outV && prevOut1 && prevOut2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*prevOut1);
-    inputs.addArg(*prevOut2);
-    outputs.addArg(*outV, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimLayer::backward(const UpdateCallback& callback) {
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
-
-    const auto outG = this->getOutputGrad();
-    const auto outV = this->getOutputValue();
-    const auto inV1 = this->getInputValue(0);
-    const auto inV2 = this->getInputValue(1);
-    auto inG1 = this->getInputGrad(0);
-    auto inG2 = this->getInputGrad(1);
-    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*outG);
-    inputs.addArg(*outV);
-    inputs.addArg(*inV1);
-    inputs.addArg(*inV2);
-    outputs.addArg(*inG1, ADD_TO);
-    outputs.addArg(*inG2, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.h b/paddle/legacy/gserver/layers/CosSimLayer.h
deleted file mode 100644
index b08e2c6a353..00000000000
--- a/paddle/legacy/gserver/layers/CosSimLayer.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief A layer for calculating cosine similarity between two vector
- * \f[
- * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+...
- * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim) *
- * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (batchSize * 1)
- *
- * The config file api is cos_sim.
- */
-class CosSimLayer : public Layer {
- public:
-  explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
deleted file mode 100644
index 03de0be815a..00000000000
--- a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-/**
- * @brief A layer for computing cosine similarity between a vector
- * and each row of a matrix
- * out[i] = cos_scale * cos(in1, in2(i,:));
- * @note used in NEURAL TURING MACHINE
- *
- * Input1: a vector (batchSize * dataDim)
- *
- * Input2: a matrix in vector form (batchSize * (weightDim*dataDim))
- *
- * Output: a vector (batchSize * weightDim)
- */
-
-class CosSimVecMatLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpMtx1;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-  MatrixPtr tmpRow2;
-  MatrixPtr tmpRow3;
-
- public:
-  explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimVecMatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
-
-bool CosSimVecMatLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dataDim = inputLayers_[0]->getSize();
-  size_t numKeys = getSize();
-  size_t memoryDim = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow2 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow3 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx1 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimVecMatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t numKeys = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, numKeys);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  CHECK(outV && inV0 && inV1);
-  REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpRow2, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
-  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV0->getHeight();
-  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
-  REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
-
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpRow1->setData(inG0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpMtx1->setData(inG1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-    tmpRow3->setData(outG->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpRow3);
-    inputs.addArg(*tmpRow2);
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpMtx1, ADD_TO);
-    outputs.addArg(*tmpRow1, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.cpp b/paddle/legacy/gserver/layers/CostLayer.cpp
deleted file mode 100644
index 18b5b77bde9..00000000000
--- a/paddle/legacy/gserver/layers/CostLayer.cpp
+++ /dev/null
@@ -1,748 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CostLayer.h"
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-bool CostLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  coeff_ = config_.coeff();
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 2UL);
-  CHECK_LE(inputLayers_.size(), 3UL);
-  if (inputLayers_.size() == 3) {
-    weightLayer_ = inputLayers_[2];
-  }
-  return true;
-}
-
-void CostLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  int size = 1;
-  resetOutput(batchSize, size);
-
-  const MatrixPtr& output = getInputValue(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  /* get the cost value for each sample*/
-  forwardImp(*output, label, *getOutputValue());
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    getOutputValue()->dotMul(*getOutputValue(), *weight);
-  }
-}
-
-void CostLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  const Argument& output = getInput(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  bool support = true;
-  if (weightLayer_) {
-    support = output.grad->getAbsSum() == 0;
-  }
-
-  backwardImp(*output.value, label, *output.grad);
-
-  if (weightLayer_) {
-    CHECK(support) << "Weighted cost layer '" << getName()
-                   << "' must be the last layer "
-                      "connected to the output layer '"
-                   << getOutputLayer()->getName() << "'";
-    output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_));
-  }
-  if (coeff_ != real(1.0f)) {
-    output.grad->add(coeff_, 0);
-  }
-}
-
-//
-// class MultiClassCrossEntropy
-//
-bool MultiClassCrossEntropy::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropy::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  target.oneHotCrossEntropy(output, *label.ids);
-}
-
-void MultiClassCrossEntropy::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-}
-
-//
-// class MultiClassCrossEntropyWithSelfNorm
-//
-REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm,
-               MultiClassCrossEntropyWithSelfNorm);
-
-bool MultiClassCrossEntropyWithSelfNorm::init(
-    const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
-                                                    Argument& label,
-                                                    Matrix& target) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-  sftMaxSum_->log2();
-
-  target.oneHotCrossEntropy(output, *label.ids);
-  target.add(*sftMaxSum_);
-
-  sftMaxSum_->square2();
-  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
-}
-
-void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
-                                                     Argument& label,
-                                                     Matrix& outputG) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-
-  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
-  sftMaxSum_->reciprocal2(*sumInv_);
-
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-  outputG.addColumnVector(*sumInv_);
-
-  sftMaxSum_->log2();
-  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
-  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
-
-  outputG.addColumnVector(*sumInv_);
-}
-
-//
-// class SoftBinaryClassCrossEntropy
-//
-REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy);
-
-bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap,
-                                       const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output,
-                                             Argument& label,
-                                             Matrix& target) {
-  Matrix::resizeOrCreate(
-      targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-  targetPerDim_->softCrossEntropy(output, *label.value);
-  targetPerDim_->rowSum(target);
-}
-
-void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& outputG) {
-  outputG.softCrossEntropyBp(output, *label.value);
-}
-
-//
-// class SumOfSquaresCostLayer
-//
-
-REGISTER_LAYER(square_error, SumOfSquaresCostLayer);
-
-bool SumOfSquaresCostLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SumOfSquaresCostLayer::forwardImp(Matrix& output,
-                                       Argument& label,
-                                       Matrix& target) {
-  target.sumOfSquares(output, *label.value);
-}
-
-void SumOfSquaresCostLayer::backwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& outputG) {
-  outputG.sumOfSquaresBp(output, *label.value);
-}
-
-//
-// class SmoothL1CostLayer
-//
-
-REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
-
-bool SmoothL1CostLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SmoothL1CostLayer::forwardImp(Matrix& output,
-                                   Argument& label,
-                                   Matrix& target) {
-  MatrixPtr targetCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    targetCpu =
-        Matrix::create(target.getHeight(), target.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    targetCpu->copyFrom(target);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
-    target.copyFrom(*targetCpu);
-  } else {
-    target.smoothL1(output, *label.value, 1.0);
-  }
-}
-
-void SmoothL1CostLayer::backwardImp(Matrix& output,
-                                    Argument& label,
-                                    Matrix& outputG) {
-  MatrixPtr outputGCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    outputGCpu =
-        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    outputGCpu->copyFrom(outputG);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
-    outputG.copyFrom(*outputGCpu);
-  } else {
-    outputG.smoothL1Bp(output, *label.value, 1.0);
-  }
-}
-
-//
-// class RankingCost
-//
-bool RankingCost::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-
-  bool ret = Layer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  if (inputLayers_.size() == 4) {
-    weightLayer_ = inputLayers_[3];
-  }
-  return true;
-}
-
-void RankingCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer(0))->getHeight();
-  int size = 1;
-  resizeOutput(batchSize, size);
-  Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_);
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, try ids
-    IVectorPtr idLabel = getInput(*getLabelLayer()).ids;
-    CHECK(idLabel) << "label layer has neither value nor ids";
-    CHECK_EQ((size_t)batchSize, idLabel->getSize());
-    Matrix::resizeOrCreate(
-        labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_);
-    labelBuf_->copyFrom(*idLabel);
-    label = labelBuf_;
-  }
-
-  MatrixPtr output[] = {getInputValue(*getOutputLayer(0)),
-                        getInputValue(*getOutputLayer(1))};
-  MatrixPtr target = this->getOutputValue();
-  margin_->sub(*output[0], *output[1]);
-
-  // for validation
-  size_t height = output[0]->getHeight();
-  target->biggerThan(*(output[0]), *(output[1]), *label);
-  double total = static_cast<double>(height);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-    total = weight->getSum();
-  }
-  double pos = target->getSum();
-  posPairCount_ += pos;
-  negPairCount_ += (total - pos);
-
-  // forward
-  target->logisticRegressionLoss(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-  }
-}
-
-void RankingCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, but in ids
-    // use labelBuf_ (should already resized and copied during forward)
-    label = labelBuf_;
-  }
-
-  Matrix::resizeOrCreate(
-      marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_);
-  marginGrad_->zeroMem();
-  marginGrad_->logisticRegressionLossBp(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    marginGrad_->dotMul(*marginGrad_, *weight);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-  getInputGrad(1)->sub(*marginGrad_);
-}
-
-void RankingCost::onPassEnd() {
-  double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_);
-  LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_
-            << " neg= " << negPairCount_;
-
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-}
-
-//
-// class LambdaCost
-//
-REGISTER_LAYER(lambda_cost, LambdaCost);
-
-bool LambdaCost::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  truncationSize_ = config_.ndcg_num();
-  maxSortSize_ = config_.max_sort_size();
-  if (maxSortSize_ != -1) {
-    CHECK_GE(maxSortSize_, truncationSize_)
-        << "maxSortSize must be greater than or equal to NDCG size!";
-  }
-  LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_
-            << ", Max partial sort size = " << maxSortSize_;
-  CHECK(!useGpu_) << "LambdaRank supports CPU only!";
-  return Layer::init(layerMap, parameterMap);
-}
-
-void LambdaCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  resizeOutput(batchSize, 1);
-
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  MatrixPtr target = this->getOutputValue();
-
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-  real* targetData = target->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    real NDCG = calcNDCG(
-        outputData + beginPos, scoreData + beginPos, endPos - beginPos);
-    for (int j = beginPos; j < endPos; ++j) {
-      targetData[j] = NDCG;
-    }
-  }
-}
-
-void LambdaCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  Matrix::resizeOrCreate(marginGrad_,
-                         score->getHeight(),
-                         1,
-                         /* trans= */ false,
-                         useGpu_);
-  marginGrad_->zeroMem();
-
-  real* gradData = marginGrad_->getData();
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    calcGrad(outputData + beginPos,
-             scoreData + beginPos,
-             gradData + beginPos,
-             endPos - beginPos);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-}
-
-void LambdaCost::calcGrad(const real* outputScore,
-                          const real* score,
-                          real* gradData,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-  int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size);
-
-  scorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    scorePair_.push_back(std::make_pair(score[i], i));
-  }
-  if (size <= sortSize) {
-    std::sort(scorePair_.begin(),
-              scorePair_.end(),
-              [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-                return a.first > b.first;
-              });
-  } else {
-    std::partial_sort(
-        scorePair_.begin(),
-        scorePair_.begin() + sortSize,
-        scorePair_.end(),
-        [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-          return a.first > b.first;
-        });
-  }
-
-  real maxDCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  for (int i = 0; i < sortSize; ++i) {
-    for (int j = i + 1; j < size; ++j) {
-      int index_i = scorePair_[i].second;
-      int index_j = scorePair_[j].second;
-      real score_i = score[index_i];
-      real score_j = score[index_j];
-      real dcgDif = 0;
-      if (j < sortSize) {
-        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
-                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
-      } else {
-        dcgDif =
-            (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
-      }
-
-      real lambda_ij =
-          -std::abs(dcgDif) /
-          (1 + std::exp(outputScore[index_i] - outputScore[index_j]));
-      gradData[index_i] += lambda_ij / maxDCG;
-      gradData[index_j] -= lambda_ij / maxDCG;
-    }
-  }
-}
-
-real LambdaCost::calcNDCG(const real* outputScore,
-                          const real* score,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-
-  outputScorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    outputScorePair_.push_back(std::make_pair(outputScore[i], i));
-  }
-  std::partial_sort(
-      outputScorePair_.begin(),
-      outputScorePair_.begin() + truncationSize_,
-      outputScorePair_.end(),
-      [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-        return a.first > b.first;
-      });
-
-  real DCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    DCG +=
-        (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2);
-  }
-
-  scoreVec_.resize(size);
-  std::copy(score, score + size, scoreVec_.begin());
-  real maxDCG = 0;
-  std::partial_sort(scoreVec_.begin(),
-                    scoreVec_.begin() + truncationSize_,
-                    scoreVec_.end(),
-                    std::greater<real>());
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  return DCG / maxDCG;
-}
-
-//
-// class MultiBinaryLabelCrossEntropy
-//
-
-REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy);
-
-bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& target) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!label.value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    target.multiBinaryLabelCrossEntropy(output, *value);
-  } else {
-    Matrix::resizeOrCreate(
-        targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-    targetPerDim_->binaryLabelCrossEntropy(output, *value);
-    targetPerDim_->rowSum(target);
-  }
-}
-
-void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
-                                               Argument& label,
-                                               Matrix& outputG) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    outputG.multiBinaryLabelCrossEntropyBp(output, *value);
-  } else {
-    outputG.binaryLabelCrossEntropyBp(output, *value);
-  }
-}
-
-bool HuberCost::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  CostLayer::init(layerMap, parameterMap);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
-  return true;
-}
-
-void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(
-          getInput(i), false, HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-}
-
-//
-// Huber loss for robust regression.
-//
-REGISTER_LAYER(huber_regression, HuberRegressionLoss);
-
-bool HuberRegressionLoss::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  HuberCost::init(layerMap, parameterMap);
-  delta_ = config_.delta();
-  return true;
-}
-
-void HuberRegressionLoss::forwardImp(Matrix& output,
-                                     Argument& label,
-                                     Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  size_t dim = output.getWidth();
-  CHECK(label.value);
-  CHECK_EQ((*label.value).getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(dim, (*label.value).getWidth());
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = std::abs(lbl[index] - out[index]);
-      if (a <= delta_)
-        cost[i] += a * a / 2;
-      else
-        cost[i] += delta_ * (a - delta_ / 2);
-    }
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberRegressionLoss::backwardImp(Matrix& output,
-                                      Argument& label,
-                                      Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  size_t dim = output.getWidth();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = lbl[index] - out[index];
-      if (std::abs(a) <= delta_)
-        grad[index] += -a;
-      else
-        grad[index] += a > 0 ? -delta_ : delta_;
-    }
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
-}
-
-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber_classification, HuberTwoClassification);
-
-bool HuberTwoClassification::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return HuberCost::init(layerMap, parameterMap);
-}
-
-void HuberTwoClassification::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  CHECK(label.ids);
-  CHECK_EQ((*label.ids).getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), (size_t)1);
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      cost[i] = -4 * a;
-    else if (a < 1)
-      cost[i] = (1 - a) * (1 - a);
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberTwoClassification::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      grad[i] += -4 * y;
-    else if (a < 1)
-      grad[i] += -2 * (1 - a) * y;
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples);
-}
-/**
- * This cost layer compute the sum of its input as loss.
- * \f[
- * o(i) = \sum_{j=1}^D y_{ij}
- * \f]
- */
-class SumCostLayer : public Layer {
- public:
-  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    if (!ret) return ret;
-    CHECK_EQ(inputLayers_.size(), 1UL);
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const MatrixPtr& input = getInputValue(0);
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = input->getHeight();
-    int size = 1;
-    resizeOutput(batchSize, size);
-    output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  }
-
-  void backward(const UpdateCallback& callback = nullptr) override {
-    getInputGrad(0)->add((real)1);
-  }
-};
-
-REGISTER_LAYER(sum_cost, SumCostLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.h b/paddle/legacy/gserver/layers/CostLayer.h
deleted file mode 100644
index 9bfec0e2b16..00000000000
--- a/paddle/legacy/gserver/layers/CostLayer.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Base class for a particular type of cost layer.
- * This type of cost should have one data layer, one label layer
- * and an optional weight layer as input.
- * The derived class should implemnt forwardImp() and backwardImp()
- * which calculate the cost for data and label. The weight is automatically
- * handled by the base class.
- */
-class CostLayer : public Layer {
- public:
-  explicit CostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void forwardImp(Matrix& outputValue,
-                          Argument& label,
-                          Matrix& cost) = 0;
-
-  virtual void backwardImp(Matrix& outputValue,
-                           Argument& label,
-                           Matrix& outputGrad) = 0;
-
- protected:
-  LayerPtr weightLayer_;
-  real coeff_;
-};
-
-/**
- * The cross-entropy loss for multi-class classification task.
- * The loss function is:
- *
- * \f[
- * L = - \sum_{i}{t_{k} * log(P(y=k))}
- * \f]
- */
-class MultiClassCrossEntropy : public CostLayer {
- public:
-  explicit MultiClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * The cross-entropy with self-normalization for multi-class classification.
- *
- * The loss function is:
- * \f[
- * L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
- * \f]
- *
- * The \f$Z(x)\f$ is the softmax normalizer.
- *
- * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
- *     Richard Schwartz, and John Makhoul. Fast and robust neural
- *     network joint models for statistical machine translation.
- *     In Proceedings of the ACL 2014 Conference.
- */
-class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
- public:
-  explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sumInv_;
-};
-
-/**
- * The cross-entropy for soft binary class.
- * \f[
- * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
- * \f]
- */
-class SoftBinaryClassCrossEntropy : public CostLayer {
- public:
-  explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  MatrixPtr targetPerDim_;
-};
-
-/**
- * This cost layer compute Euclidean (L2) loss for real-valued regression
- * tasks.
- * \f[
- * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
- * \f]
- */
-class SumOfSquaresCostLayer : public CostLayer {
- public:
-  explicit SumOfSquaresCostLayer(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * This cost layer compute smooth L1 loss for real-valued regression
- * tasks.
- * \f[
- * L =
- *   0.5 * x^2    if / -1 < |x| < 1 /
- *   |x| - 0.5    / otherwise /
- * \f]
- *
- * x = output - label
- */
-class SmoothL1CostLayer : public CostLayer {
- public:
-  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * A cost layer for learning to rank (LTR) task. This layer contains at leat
- * three inputs.
- * \f[
- *  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
- *  o_{i,j} =  o_i - o_j  \\
- *  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
- * \f]
- *
- * [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
- *      Rank useing Gradient Descent.
- */
-class RankingCost : public Layer {
- public:
-  explicit RankingCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[2]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  void onPassEnd() override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-    (void)output;
-    (void)label;
-    (void)cost;
-  }
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {
-    (void)outputValue;
-    (void)label;
-    (void)outputGrad;
-  }
-
- private:
-  double posPairCount_;
-  double negPairCount_;
-  MatrixPtr margin_;
-  MatrixPtr marginGrad_;
-  /// if input label is put in ids (not value), copy to this buffer.
-  MatrixPtr labelBuf_;
-  LayerPtr weightLayer_;
-};
-
-/**
- * LambdaRank os a method for learning arbitrary information retrieval
- * measures. It can be applied to any algorithm that learns through gradient
- * descent. LambdaRank is a listwise method, in that the cost depends on the
- * sorted order of the documents. LambdaRank gives the gradient of cost
- * function:
- *
- * \f[
- * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
- * \f]
- *
- * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
- *     with Nonsmooth Cost Functions.
- */
-class LambdaCost : public Layer {
- public:
-  explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getScoreLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  real calcNDCG(const real* outputScore, const real* score, int size);
-  void calcGrad(const real* outputScore,
-                const real* score,
-                real* gradData,
-                int size);
-
- private:
-  MatrixPtr marginGrad_;
-  int truncationSize_;
-  int maxSortSize_;
-  std::vector<std::pair<real, int>> scorePair_;
-  std::vector<std::pair<real, int>> outputScorePair_;
-  std::vector<real> scoreVec_;
-};
-
-/**
- * Cross entropy for multi binary labels.
- * \f[
- * cost[i] = -sum(label[i][j]*log(output[i][j]) +
- *            (1-label[i][j])*log(1-output[i][j]))
- * \f]
- */
-class MultiBinaryLabelCrossEntropy : public CostLayer {
- protected:
-  MatrixPtr targetPerDim_;
-
- public:
-  explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/*
- * A base layer for HuberRegressionLoss and HuberTwoClassification.
- */
-class HuberCost : public CostLayer {
- public:
-  std::vector<Argument> tmpCpuInput_;
-
-  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override {}
-};
-
-/**
- * Huber loss for robust regression.
- *
- * Given output f(x), label y and delta, the loss is:
- * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
- * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
- */
-class HuberRegressionLoss : public HuberCost {
- public:
-  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  real delta_;
-};
-
-/**
- * Huber loss for robust 2-classes classification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
- * Loss = 4 * y * f, if y* f < -1 \\
- * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
- * Loss = 0, otherwise
- */
-class HuberTwoClassification : public HuberCost {
- public:
-  explicit HuberTwoClassification(const LayerConfig& config)
-      : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-typedef std::shared_ptr<CostLayer> CostLayerPtr;
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.cpp b/paddle/legacy/gserver/layers/CropLayer.cpp
deleted file mode 100644
index d891375ecce..00000000000
--- a/paddle/legacy/gserver/layers/CropLayer.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(crop, CropLayer);
-
-bool CropLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
-  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
-  crop_axis_ = config_.axis();
-  for (int i = 0; i < config_.offset_size(); i++) {
-    crop_offsets_.push_back(config_.offset(i));
-  }
-
-  // 1. get input_0 shape
-  auto& input0_img_conf = config_.inputs(0).image_conf();
-  inDims_ = TensorShape({0,
-                         input0_img_conf.channels(),
-                         input0_img_conf.has_img_size_y()
-                             ? input0_img_conf.img_size_y()
-                             : input0_img_conf.img_size(),
-                         input0_img_conf.img_size()});
-  // 2. get target dims from config
-  if (config_.inputs_size() == 1) {
-    targetDims_ = TensorShape({config_.shape(0),
-                               config_.shape(1),
-                               config_.shape(2),
-                               config_.shape(3)});
-  } else {
-    // 2. get input_1 shape
-    auto& input1_img_conf = config_.inputs(1).image_conf();
-    targetDims_ = TensorShape({0,
-                               input1_img_conf.channels(),
-                               input1_img_conf.has_img_size_y()
-                                   ? input1_img_conf.img_size_y()
-                                   : input1_img_conf.img_size(),
-                               input1_img_conf.img_size()});
-  }
-
-  // 3. get final crop corner
-  int dimSize = 4;
-  crop_corner_ = {0, 0, 0, 0};
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      if (crop_offsets_.size() > 1) {
-        crop_corner_[i] = crop_offsets_[i - crop_axis_];
-      } else {
-        crop_corner_[i] = crop_offsets_[0];
-      }
-    }
-  }
-
-  outDims_ = TensorShape(4);
-
-  createFunction(
-      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
-  createFunction(
-      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
-
-  return true;
-}
-
-void CropLayer::setOutDims() {
-  MatrixPtr input = inputLayers_[1]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  // get target dims from input_1
-  if (config_.inputs_size() == 2) {
-    targetDims_.setDim(0, batchSize);
-    int ch = config_.inputs(0).image_conf().channels();
-    if (ch != 0) targetDims_.setDim(1, ch);
-    int h = inputLayers_[1]->getOutput().getFrameHeight();
-    if (h != 0) targetDims_.setDim(2, h);
-    int w = inputLayers_[1]->getOutput().getFrameWidth();
-    if (w != 0) targetDims_.setDim(3, w);
-  }
-  // get final crop shape from target dims and crop axis
-  std::vector<uint32_t> crop_shape;
-  int dimSize = 4;
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      crop_shape.push_back(targetDims_[i]);
-    } else {
-      crop_shape.push_back(inDims_[i]);
-    }
-  }
-
-  outDims_.reshape(
-      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
-  output_.setFrameHeight(crop_shape[2]);
-  output_.setFrameWidth(crop_shape[3]);
-}
-
-void CropLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-}
-
-void CropLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(outDims_[0], size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("CropForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CropLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.h b/paddle/legacy/gserver/layers/CropLayer.h
deleted file mode 100644
index ef88bc483d1..00000000000
--- a/paddle/legacy/gserver/layers/CropLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer crop input according to the specify conf.
- *         input_0: input to be cropped
- *         input_1: optional reference input
- *         axis: start dimension to be croped
- *         offset: offset of cropping  in each dimension
- *         shape: if reference input layer was not setted,
- *                  crop input as this shape conf
- */
-class CropLayer : public Layer {
- public:
-  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CropLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  void setOutDims();
-  void setInDims();
-
-  int32_t crop_axis_;
-  std::vector<uint32_t> crop_offsets_;
-  std::vector<uint32_t> crop_corner_;
-  TensorShape inDims_;
-  TensorShape targetDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
deleted file mode 100644
index 0fe100a96c0..00000000000
--- a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
-                                                    size_t iter,
-                                                    size_t spatialDim) {
-  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
-                        channels_,
-                        spatialDim,
-                        false,
-                        useGpu_);
-}
-
-MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
-                                                     size_t iter,
-                                                     size_t spatialDim) {
-  return Matrix::create(
-      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
-}
-
-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void CrossChannelNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inV = getInputValue(0);
-
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = inV->getWidth();
-  CHECK_EQ(getSize(), dataDim);
-
-  reserveOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-  size_t spatialDim = dataDim / channels_;
-
-  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
-  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-
-  inV->square2(*dataBuffer_);
-  for (size_t i = 0; i < batchSize; i++) {
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
-    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    // compute norm.
-    spatialBuffer_->sumCols(*dataTmp, 1, 0);
-    // add eps to avoid overflow
-    spatialBuffer_->add(1e-6);
-    spatialBuffer_->sqrt2(*spatialBuffer_);
-    normTmp->copyFrom(*spatialBuffer_);
-    outVTmp->copyFrom(*inVTmp);
-    outVTmp->divRowVector(*spatialBuffer_);
-    // scale the layer.
-    outVTmp->mulColVector(*scale_->getW());
-  }
-}
-
-void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr outV = getOutputValue();
-
-  size_t batchSize = inG->getHeight();
-  size_t dataDim = inG->getWidth();
-  size_t spatialDim = dataDim / channels_;
-
-  MatrixPtr inGBuffer;
-  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
-
-  dataBuffer_->dotMul(*outG, *outV);
-  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
-  scaleDiff_->zeroMem();
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
-    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    channelBuffer_->sumRows(*dataTmp, 1, 0);
-    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
-    // store a / scale[i] in scaleDiff_ temporary
-    scaleDiff_->add(*channelBuffer_, 1.);
-
-    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
-    // scale the grad
-    inGBuffer->copyFrom(*inVTmp);
-    inGBuffer->mulRowVector(*spatialBuffer_);
-    // divide by square of norm
-    spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGBuffer->divRowVector(*spatialBuffer_);
-    // subtract
-    inGBuffer->add(*outGTmp, -1, 1);
-    // divide by norm
-    inGBuffer->divRowVector(*normTmp);
-    // scale the diff
-    inGBuffer->mulColVector(*scale_->getW());
-
-    inGTmp->add(*inGBuffer);
-  }
-  // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
-  scale_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
deleted file mode 100644
index f3bf2148587..00000000000
--- a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossEntropyOverBeam.h"
-
-namespace paddle {
-
-void CostForOneSequence::calValidExpandStep() {
-  validExpansionCount_ = 0;
-  goldAsExtraPath_ = true;
-
-  for (size_t i = 0; i < beams_->expansionCount; ++i) {
-    real gold = static_cast<real>(beams_->gold[i]);
-    if (i) {
-      real* start = beams_->candidateIds[i - 1]->getData();
-      goldRowIds_[i] = std::count_if(
-          start,
-          start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
-          [](const real& val) { return val != -1.; });
-    } else {
-      goldRowIds_[i] = 0;
-    }
-
-    real* start =
-        beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
-    real* findEnd = std::find(start, start + beamSize_, gold);
-    validExpansionCount_++;
-
-    if (start + beamSize_ == findEnd) return;
-    goldColIds_[i] = findEnd - start;
-  }
-  if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
-}
-
-size_t CostForOneSequence::initLastExpansion() {
-  int beamId = validExpansionCount_ - 1;
-  const MatrixPtr candidates = beams_->candidateIds[beamId];
-  size_t height = candidates->getHeight();
-
-  /* initialization the last expansion. */
-  size_t pathCount = std::count_if(candidates->getData(),
-                                   candidates->getData() + height * beamSize_,
-                                   [](const real& val) { return val != -1; });
-  /*
-   * if the gold sequence falls off the beam during search, add the gold
-   * sequence as the last path into the all expanded candidates.
-   */
-  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;
-
-  pathRowIdsInEachBeam_.clear();
-  pathRowIdsInEachBeam_.resize(validExpansionCount_,
-                               std::vector<int>(pathCount, 0));
-  parentIdsInBeam_.clear();
-  parentIdsInBeam_.resize(pathCount, 0);
-
-  if (goldAsExtraPath_) {
-    /* add gold sequence into the total expansion. */
-    pathRowIdsInEachBeam_[beamId].back() =
-        beams_->gold[beamId] +
-        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
-    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
-  } else {
-    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
-    goldIdsInFinalExpansion_ =
-        std::count_if(candidates->getData(),
-                      candidates->getData() + goldOffset,
-                      [](const real& val) { return val != -1.; });
-  }
-
-  /*
-   * TODO(caoying): fix this, store the indices of selected candidate
-   * paths into Argument.ids
-   */
-  real* ids = candidates->getData();
-  size_t curIdx = 0;
-  for (size_t i = 0; i < height; ++i) {
-    int basePos = getSeqStartPos(beamId, i);
-    for (size_t j = 0; j < beamSize_; ++j) {
-      int id = ids[i * beamSize_ + j];
-      if (id == -1) continue;
-      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
-      parentIdsInBeam_[curIdx++] = i;
-    }
-  }
-  return pathCount;
-}
-
-void CostForOneSequence::constructTotalExpansion() {
-  /*
-   * construct the entire expanded beam by begining with the last search
-   * in which gold falls off the beam.
-   */
-  size_t totalPathCount = initLastExpansion();
-
-  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
-    const MatrixPtr candidates = beams_->candidateIds[beamId];
-    real* ids = candidates->getData();
-
-    int lastParentIdInBeam = -1;
-    int basePos = -1;
-    for (size_t i = 0;
-         i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount);
-         ++i) {
-      int id = ids[parentIdsInBeam_[i]];
-      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
-      if (parentIdsInBeam_[i] != lastParentIdInBeam)
-        basePos = getSeqStartPos(beamId, parentRowId);
-
-      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
-      lastParentIdInBeam = parentIdsInBeam_[i];
-      parentIdsInBeam_[i] = parentRowId;
-
-      if (goldAsExtraPath_)
-        pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
-            beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
-    }
-  }
-}
-
-real CostForOneSequence::globallyNormalizedScore() {
-  expandedPathScores_.resize(validExpansionCount_);
-
-  Matrix::resizeOrCreate(
-      softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false);
-  softmaxOut_->zeroMem();
-  MatrixPtr tmp = Matrix::create(
-      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
-
-  for (size_t i = 0; i < validExpansionCount_; ++i) {
-    Matrix::resizeOrCreate(expandedPathScores_[i],
-                           pathRowIdsInEachBeam_[i].size(),
-                           1,
-                           false,
-                           false);
-    expandedPathScores_[i]->zeroMem();
-
-    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
-                                        pathRowIdsInEachBeam_[i].size(),
-                                        false);
-    expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds);
-    tmp->add(*expandedPathScores_[i]);
-  }
-
-  softmaxOut_->softmax(*softmaxOut_);
-  return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]);
-}
-
-real CostForOneSequence::forward() {
-  calValidExpandStep();
-  constructTotalExpansion();
-  return globallyNormalizedScore();
-}
-
-void CostForOneSequence::backward() {
-  /*
-   * when softmax layer is the output layer, and it is combined with
-   * cross-entropy as cost. The derivate with regard to softmax's input
-   * is simply:
-   *
-   * grad_i = softmax_out_i - target_i,
-   *
-   * and here hard label is used.
-   */
-  softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
-
-  MatrixPtr tmp = Matrix::create(
-      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
-
-  for (size_t i = 0; i < validExpansionCount_; ++i) {
-    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
-                                        pathRowIdsInEachBeam_[i].size(),
-                                        false);
-    /*
-      beams_->scoreGrad[i] has been intialized outside this class, this
-      class only keeps a pointer pointing to the original input gradients,
-      so here does not need to allocate or initalize the memory.
-    */
-    tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
-  }
-}
-
-REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
-
-bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";
-
-  beamExpanCount_ = inputLayers_.size() / 3;
-
-  candidateScores_.resize(beamExpanCount_);
-  candidateScoreGrad_.resize(beamExpanCount_);
-
-  candidateInBeam_.resize(beamExpanCount_);
-  goldSequence_.resize(beamExpanCount_);
-  gradToInputs_.resize(beamExpanCount_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void CrossEntropyOverBeam::checkInputs() {
-  batchSize_ = 0;
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    const Argument& scores = getInput(i * 3);
-    const Argument& selCandidates = getInput(i * 3 + 1);
-    const Argument& goldSeq = getInput(i * 3 + 2);
-
-    if (i) {
-      CHECK(scores.hasSubseq()) << "input " << i << " "
-                                << inputLayers_[i * 3]->getName()
-                                << " should be a nested sequence";
-      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
-      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
-      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
-    } else {
-      CHECK(scores.hasSeq()) << "input " << i << " "
-                             << inputLayers_[i]->getName()
-                             << " should be a sequence";
-      batchSize_ = scores.getNumSequences();
-      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
-      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
-    }
-    CHECK_EQ(1U, scores.value->getWidth());
-    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
-  }
-}
-
-void CrossEntropyOverBeam::copyInputsToCpu() {
-  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
-    if (dynamic_cast<GpuMatrix*>(src.get())) {
-      Matrix::resizeOrCreate(
-          trg, src->getHeight(), src->getWidth(), false, false);
-      trg->copyFrom(*src);
-    } else {
-      trg = std::move(src);
-    }
-  };
-
-  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
-    if (dynamic_cast<GpuIVector*>(src.get())) {
-      IVector::resizeOrCreate(trg, src->getSize(), false);
-      trg->copyFrom(*src);
-    } else {
-      trg = std::move(src);
-    }
-  };
-
-  beamSplitPos_.clear();
-  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    copyValue(getInputValue(i * 3), candidateScores_[i]);
-    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
-    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);
-
-    if (i) {
-      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
-      const int* seqStarts = seqInfo->getMutableData(false);
-      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
-      const int* subSeqStarts = subSeqInfo->getMutableData(false);
-
-      size_t seqId = 1;
-      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
-           ++subSeqId) {
-        CHECK_LT(seqId, seqInfo->getSize());
-        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
-          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
-          seqId++;
-        }
-        beamSplitPos_[seqId - 1][i]++;
-      }
-    } else {
-      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
-    }
-  }
-}
-
-void CrossEntropyOverBeam::splitBatchBeams() {
-  beamCosts_.resize(batchSize_);
-  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));
-
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    int* seqStarts =
-        getInput(i * 3).sequenceStartPositions->getMutableData(false);
-
-    int* subSeqStarts = nullptr;
-    int maxLen = 0;
-    if (i) {
-      subSeqStarts =
-          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
-      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
-    } else {
-      maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
-    }
-
-    for (size_t j = 0; j < batchSize_; ++j) {
-      beamPerSeq_[j].scores[i] =
-          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
-                         seqStarts[j + 1] - seqStarts[j],
-                         1,
-                         false,
-                         false);
-      beamPerSeq_[j].scoreGrad[i] =
-          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
-                         seqStarts[j + 1] - seqStarts[j],
-                         1,
-                         false,
-                         false);
-
-      int offset = j ? beamSplitPos_[j - 1][i] : 0;
-      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
-      CHECK_GE(maxLen, offset + height);
-      beamPerSeq_[j].seqInfo[i] = IVector::create(
-          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);
-
-      beamPerSeq_[j].candidateIds[i] =
-          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
-                         height,
-                         beamSize_,
-                         false,
-                         false);
-      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];
-
-      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
-    }
-  }
-}
-
-void CrossEntropyOverBeam::resizeOutput() {
-  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
-  output_.value->zeroMem();
-
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    MatrixPtr inGrad = getInputGrad(i * 3);
-    if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
-      Matrix::resizeOrCreate(candidateScoreGrad_[i],
-                             inGrad->getHeight(),
-                             inGrad->getWidth(),
-                             false,
-                             false);
-    } else {
-      candidateScoreGrad_[i] = std::move(inGrad);
-    }
-    candidateScoreGrad_[i]->zeroMem();
-  }
-}
-
-void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
-      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);
-
-    if (i == copyCount - 1) break;
-  }
-}
-
-void CrossEntropyOverBeam::forward(PassType passType) {
-  Layer::forward(passType);
-
-  checkInputs();
-  copyInputsToCpu();
-
-  resizeOutput();
-  splitBatchBeams();
-
-  MatrixPtr outputValue = getOutputValue();
-  for (size_t i = 0; i < batchSize_; ++i) {
-    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
-    beamCosts_[i].setData(std::move(ptr), beamSize_);
-    outputValue->getData()[i] = beamCosts_[i].forward();
-  }
-}
-
-void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < batchSize_; ++i) {
-    beamCosts_[i].backward();
-    copyGradToGpu(beamCosts_[i].getValidExpansionCount());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
deleted file mode 100644
index c8702b16165..00000000000
--- a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "CrossEntropyOverBeam.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/* This struct stores the beams in all search steps for a single sequence. */
-struct BeamExpansion {
-  std::vector<MatrixPtr> scores;
-  std::vector<IVectorPtr> seqInfo;
-
-  std::vector<MatrixPtr> candidateIds;
-  std::vector<int> gold;
-
-  std::vector<MatrixPtr> scoreGrad;
-
-  size_t expansionCount;
-
-  explicit BeamExpansion(int n) {
-    expansionCount = n;
-    scores.resize(expansionCount);
-    seqInfo.resize(expansionCount);
-    candidateIds.resize(expansionCount);
-    scoreGrad.resize(expansionCount);
-
-    gold.resize(expansionCount);
-  }
-};
-typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
-
-class CostForOneSequence {
- public:
-  CostForOneSequence()
-      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
-  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
-    beams_ = bPtr;
-    beamSize_ = beamSize;
-
-    expandedPathScores_.clear();
-    expandedPathScores_.resize(beams_->expansionCount);
-
-    goldRowIds_.clear();
-    goldRowIds_.resize(beams_->expansionCount, 0);
-    goldColIds_.clear();
-    goldColIds_.resize(beams_->expansionCount, -1);
-  }
-  size_t getValidExpansionCount() { return validExpansionCount_; }
-
-  real forward();
-  void backward();
-
- private:
-  void calValidExpandStep();
-  void constructTotalExpansion();
-  size_t initLastExpansion();
-  real globallyNormalizedScore();
-
-  int getSeqStartPos(size_t beamId, size_t rowId) {
-    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
-    int* starts = beams_->seqInfo[beamId]->getData();
-    return starts[rowId] - starts[0];
-  }
-
-  size_t beamSize_;
-  size_t validExpansionCount_;
-  bool goldAsExtraPath_;
-  std::vector<int> goldRowIds_;
-  std::vector<int> goldColIds_;
-
-  BeamExpansionPtr beams_;
-  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
-  std::vector<int> parentIdsInBeam_;
-  size_t goldIdsInFinalExpansion_;
-
-  std::vector<MatrixPtr> expandedPathScores_;
-
-  MatrixPtr softmaxOut_;
-};
-
-class CrossEntropyOverBeam : public Layer {
- public:
-  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- private:
-  void checkInputs();
-  void copyInputsToCpu();
-  void resizeOutput();
-  void copyGradToGpu(size_t copyCount);
-  void splitBatchBeams();
-
-  size_t beamExpanCount_;
-  size_t batchSize_;
-  size_t beamSize_;
-
-  /*
-   * the process of constructing beams is not friendly to GPU, currently, this
-   * layer only runs on CPU, if any of its inputs is on GPU memory, then copy
-   * it to CPU memory.
-   */
-  std::vector<MatrixPtr> candidateScores_;
-  std::vector<MatrixPtr> candidateScoreGrad_;
-  std::vector<MatrixPtr> candidateInBeam_;
-  std::vector<MatrixPtr> gradToInputs_;
-  std::vector<IVectorPtr> goldSequence_;
-  std::vector<std::vector<int>> beamSplitPos_;
-
-  /*
-   * split entire bath of beams into beam per sequnence and store the result
-   * into this member.
-   */
-  std::vector<BeamExpansion> beamPerSeq_;
-  /* beamCosts_ is used to propagate error in one sequence. */
-  std::vector<CostForOneSequence> beamCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
deleted file mode 100644
index 051155e0d2c..00000000000
--- a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnBatchNormLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/cuda/include/hl_batch_norm.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
-
-bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnBatchNorm only support GPU";
-
-  hl_create_tensor_descriptor(&ioDesc_);
-  hl_create_tensor_descriptor(&bnParamDesc_);
-  hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1);
-
-  return true;
-}
-
-void CudnnBatchNormLayer::reshape(int batchSize) {
-  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
-}
-
-void CudnnBatchNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  reshape(batchSize);
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* output = getOutputValue()->getData();
-  real* gamma = weight_->getW()->getData();
-  real* beta = biases_->getW()->getData();
-  real* movingMean = movingMean_->getW()->getData();
-  real* movingVar = movingVar_->getW()->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  if (!useGlobalStats_) {
-    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
-    real* savedMean = savedMean_->getData();
-    real* savedInvVar = savedInvVar_->getData();
-    hl_batch_norm_forward_training(ioDesc_,
-                                   input,
-                                   ioDesc_,
-                                   output,
-                                   bnParamDesc_,
-                                   gamma,
-                                   beta,
-                                   1.0 - movingAvgFraction_,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   savedMean,
-                                   savedInvVar);
-  } else {
-    // used movingMean and movingVar in testing
-    if (batchSize <= 1024) {
-      hl_batch_norm_forward_inference(ioDesc_,
-                                      input,
-                                      ioDesc_,
-                                      output,
-                                      bnParamDesc_,
-                                      gamma,
-                                      beta,
-                                      movingMean,
-                                      movingVar,
-                                      eps_);
-    } else {
-      // There is a limitation in cudnn library.
-      // When the batch size is larger than 1024 in cuDNN v5.1,
-      // the cudnnBatchNormalizationForwardInference will fail.
-      hl_batch_norm_cuda_inference(input,
-                                   output,
-                                   gamma,
-                                   beta,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   batchSize,
-                                   channels_,
-                                   imageH_ * imageD_,
-                                   imageW_);
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* outGrad = getOutputGrad()->getData();
-  real* inGrad = getInputGrad(0)->getData();
-  real* gamma = weight_->getW()->getData();
-  real* savedMean = savedMean_->getData();
-  real* savedInvVar = savedInvVar_->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
-    Matrix::resizeOrCreate(m, h, w, false, true);
-    m->zeroMem();
-    *p = m->getData();
-  };
-
-  real* gammaGrad = nullptr;
-  real* betaGrad = nullptr;
-  if (weight_->getWGrad()) {
-    gammaGrad = weight_->getWGrad()->getData();
-  } else {
-    create(tmpWGrad_, 1, channels_, &gammaGrad);
-  }
-  if (biases_ && biases_->getWGrad()) {
-    betaGrad = biases_->getWGrad()->getData();
-  } else {
-    create(tmpBiasGrad_, 1, channels_, &betaGrad);
-  }
-
-  hl_batch_norm_backward(ioDesc_,
-                         input,
-                         ioDesc_,
-                         outGrad,
-                         ioDesc_,
-                         inGrad,
-                         bnParamDesc_,
-                         gamma,
-                         gammaGrad,
-                         betaGrad,
-                         eps_,
-                         savedMean,
-                         savedInvVar);
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    biases_->getParameterPtr()->incUpdate(callback);
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-CudnnBatchNormLayer::~CudnnBatchNormLayer() {
-  hl_destroy_tensor_descriptor(ioDesc_);
-  hl_destroy_tensor_descriptor(bnParamDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
deleted file mode 100644
index 3b33b983b31..00000000000
--- a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cudnn.h>
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment.
- * @note Cudnn version must >= v4.0, and better to use the latest version
- * (v5.1).
- *
- * The config file api is batch_norm_layer.
- */
-
-class CudnnBatchNormLayer : public BatchNormBaseLayer {
- public:
-  explicit CudnnBatchNormLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config) {}
-
-  ~CudnnBatchNormLayer();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  /**
-   * reshape tensor of ioDesc_.
-   */
-  void reshape(int batchSize);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  /// Epsilon value used in the batch normalization formula.
-  /// Same epsilon value should be used in forward and backward functions.
-  double eps_;
-
-  /// Input/output tensor descriptor desc
-  hl_tensor_descriptor ioDesc_;
-  /// Shared tensor descriptor desc for the 6 tenros:
-  /// bnScale, bnBias, running mean/var, save_mean/var
-  hl_tensor_descriptor bnParamDesc_;
-
-  /**
-   * @brief The gradient of weight and bias in cudnn api can not be empty.
-   * If set is_static for weight or bias, it will not allocate memory for them,
-   * and the gradient is NULL. In this case, will use two matrix.
-   */
-  MatrixPtr tmpWGrad_, tmpBiasGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
deleted file mode 100644
index 9353cca9c83..00000000000
--- a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnConvBaseLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
-REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
-
-bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
-                              const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnConvLayer only support gpu";
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projConf_.reserve(inputLayers_.size());
-
-  numFilters_ = config_.num_filters();
-  CHECK(config_.shared_biases());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    ProjectionConfig *conf = new ProjectionConfig();
-    if (isDeconv_) {
-      conf->set_type("convt");
-    } else {
-      conf->set_type("conv");
-    }
-    conf->set_num_filters(numFilters_);
-    ConvConfig *convConf = conf->mutable_conv_conf();
-    *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
-    conf->set_input_size(getPrev(i)->getSize());
-    conf->set_output_size(getSize());
-    projConf_.emplace_back(conf);
-    projections_.emplace_back(
-        Projection::create(*projConf_[i], parameters_[i], useGpu_));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  if (biases_.get() && sharedBiases_) {
-    hl_create_tensor_descriptor(&biasDesc_);
-    hl_create_tensor_descriptor(&outputDesc_);
-    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
-  }
-
-  return true;
-}
-
-void CudnnConvBaseLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  resetOutput(batchSize, calOutputSize());
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->forward(&getInput(i), &getOutput(), passType);
-  }
-
-  if (biases_) {
-    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
-    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH = outputH_[0];
-    int outW = outputW_[0];
-
-    hl_tensor_reshape(outputDesc_,
-                      batchSize,
-                      numFilters_,
-                      outH,
-                      outW,
-                      numFilters_ * outH * outW,
-                      outH * outW,
-                      outW,
-                      1);
-    real *outData = getOutputValue()->getData();
-    real *biasData = biases_->getW()->getData();
-    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
-  }
-
-  forwardActivation();
-}
-
-void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
-    real *biasGrad = biases_->getWGrad()->getData();
-    real *outGrad = getOutputGrad()->getData();
-    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->backward(callback);
-  }
-}
-
-CudnnConvBaseLayer::~CudnnConvBaseLayer() {
-  if (biases_) {
-    hl_destroy_tensor_descriptor(biasDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
deleted file mode 100644
index d050183eb78..00000000000
--- a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "Projection.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A 2-dimension conv layer implemented by cuDNN. It only
- *        supports GPU mode. We automatic select CudnnConvLayer for GPU
- *        mode and ExpandConvLayer for CPU mode if you set type of "conv".
- *        User also can specfiy type of "exconv" or "cudnn_conv" for
- *        particular type.
- *
- * The config file api is img_conv_layer.
- */
-class CudnnConvBaseLayer : public ConvBaseLayer {
- protected:
-  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
-  std::vector<std::unique_ptr<Projection>> projections_;
-
-  hl_tensor_descriptor biasDesc_;
-  hl_tensor_descriptor outputDesc_;
-
- public:
-  explicit CudnnConvBaseLayer(const LayerConfig& config)
-      : ConvBaseLayer(config) {}
-
-  ~CudnnConvBaseLayer();
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
deleted file mode 100644
index c790dfd71ef..00000000000
--- a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnPoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-bool CudnnPoolLayer::typeCheck(const std::string &poolType,
-                               hl_pooling_mode_t *mode) {
-  if (poolType == "cudnn-max-pool") {
-    if (mode) {
-      *mode = HL_POOLING_MAX;
-    }
-  } else if (poolType == "cudnn-avg-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE;
-    }
-  } else if (poolType == "cudnn-avg-incl-pad-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
-    }
-  } else {
-    return false;
-  }
-
-  return true;
-}
-
-CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) {
-  const std::string &pool_type = config.inputs(0).pool_conf().pool_type();
-  CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true);
-}
-
-bool CudnnPoolLayer::init(const LayerMap &layerMap,
-                          const ParameterMap &parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-
-  CHECK(useGpu_) << "CudnnPoolLayer only support gpu";
-
-  hl_create_tensor_descriptor(&inputDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-
-  windowHeight = sizeY_;
-  windowWidth = sizeX_;
-  heightPadding = confPaddingY_;
-  widthPadding = confPadding_;
-  strideHeight = strideY_;
-  strideWidth = stride_;
-
-  hl_create_pooling_descriptor(&poolingDesc_,
-                               mode_,
-                               windowHeight,
-                               windowWidth,
-                               heightPadding,
-                               widthPadding,
-                               strideHeight,
-                               strideWidth);
-
-  return true;
-}
-
-void CudnnPoolLayer::reshape(int batchSize) {
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imageH_ == 0) {
-    imageH_ = imgSizeY_;
-  }
-  if (imageW_ == 0) {
-    imageW_ = imgSize_;
-  }
-  CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(),
-           channels_ * imageH_ * imageW_);
-  outputH_ = outputSize(imageH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ =
-      outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-
-  hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_);
-  hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_);
-}
-
-void CudnnPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(inputLayers_[0]->getOutputValue()->useGpu());
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  reshape(batchSize);
-  resetOutput(batchSize, outputH_ * outputW_ * channels_);
-
-  real *inputData = getInputValue(0)->getData();
-  real *outData = getOutputValue()->getData();
-  hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_);
-}
-
-void CudnnPoolLayer::backward(const UpdateCallback &callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  real *inputData = getInputValue(0)->getData();
-  real *inputGrad = getInputGrad(0)->getData();
-  real *outData = getOutputValue()->getData();
-  real *outGrad = getOutputGrad()->getData();
-  hl_pooling_backward(inputDesc_,
-                      inputData,
-                      inputGrad,
-                      outputDesc_,
-                      outData,
-                      outGrad,
-                      poolingDesc_);
-}
-
-CudnnPoolLayer::~CudnnPoolLayer() {
-  hl_destroy_tensor_descriptor(inputDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_pooling_descriptor(poolingDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.h b/paddle/legacy/gserver/layers/CudnnPoolLayer.h
deleted file mode 100644
index fc249354d10..00000000000
--- a/paddle/legacy/gserver/layers/CudnnPoolLayer.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "PoolLayer.h"
-
-namespace paddle {
-
-/**
- * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by
- * cudnn api and only supports GPU.
- *
- * The config file api is img_pool_layer.
- */
-
-class CudnnPoolLayer : public PoolLayer {
- protected:
-  int windowHeight, windowWidth;
-  int heightPadding, widthPadding, strideHeight, strideWidth;
-  int imageH_, imageW_, outputH_, outputW_;
-  /// mode_ is poolint type, inlcuding "cudnn-max-pool", "cudnn-avg-pool"
-  /// "cudnn-avg-excl-pad-pool".
-  hl_pooling_mode_t mode_;
-  /// cudnn tensor descriptor for input.
-  hl_tensor_descriptor inputDesc_;
-  /// cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// A description of a pooling operation.
-  hl_pooling_descriptor poolingDesc_;
-
- public:
-  static bool typeCheck(const std::string& poolType,
-                        hl_pooling_mode_t* mode = nullptr);
-  explicit CudnnPoolLayer(const LayerConfig& config);
-  ~CudnnPoolLayer();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Reshape input and output tensor descriptor.
-   * The batch size maybe change during training in last batch of each pass.
-   * So reshaping is needed.
-   */
-  void reshape(int batchSize);
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataLayer.cpp b/paddle/legacy/gserver/layers/DataLayer.cpp
deleted file mode 100644
index 4cadaa76631..00000000000
--- a/paddle/legacy/gserver/layers/DataLayer.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(data, DataLayer);
-
-void DataLayer::copyDataToOutput(Argument& output) {
-  if (output.deviceId == data_.deviceId) {
-    output.value = data_.value;
-    output.in = data_.in;
-    output.grad = data_.grad;
-    output.ids = data_.ids;
-  } else {
-    SetDevice device(output.deviceId);
-    if (data_.value) {
-      if (!output.value) {
-        output.value = data_.value->clone(data_.value->getHeight(),
-                                          data_.value->getWidth(),
-                                          useGpu(output.deviceId));
-      } else {
-        output.value->resize(data_.value->getHeight(), data_.value->getWidth());
-      }
-      output.value->copyFrom(*data_.value);
-    }
-    if (data_.grad) {
-      Matrix::resizeOrCreate(output.grad,
-                             data_.grad->getHeight(),
-                             data_.grad->getWidth(),
-                             /* trans= */ false,
-                             useGpu(output.deviceId));
-    }
-    if (data_.ids) {
-      IVector::resizeOrCreate(
-          output.ids, data_.ids->getSize(), useGpu(output.deviceId));
-      output.ids->copyFrom(*data_.ids);
-    }
-  }
-  if (config_.height() && config_.width()) {
-    output.setFrameHeight(config_.height());
-    output.setFrameWidth(config_.width());
-  } else {
-    output.setFrameHeight(data_.getFrameHeight());
-    output.setFrameWidth(data_.getFrameWidth());
-  }
-  output.cpuSequenceDims = data_.cpuSequenceDims;
-  output.sequenceStartPositions = data_.sequenceStartPositions;
-  output.subSequenceStartPositions = data_.subSequenceStartPositions;
-  output.strs = data_.strs;
-
-  output.notifyValueReady();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataLayer.h b/paddle/legacy/gserver/layers/DataLayer.h
deleted file mode 100644
index d02f5a4697b..00000000000
--- a/paddle/legacy/gserver/layers/DataLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-
-namespace paddle {
-/**
- * This layer just copy data to output, and has no backward propagation.
- *
- * The config file api is data_layer.
- */
-class DataLayer : public Layer {
- public:
-  explicit DataLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual void setData(const Argument& data) { data_ = data; }
-
-  /**
-   * Prefetch sparse matrix/ids only.
-   */
-  void prefetch() override { output_ = data_; }
-
-  /**
-   * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
-   * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
-   */
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    copyDataToOutput(output_);
-    if (FLAGS_show_layer_stat) {
-      showOutputStats();
-    }
-  }
-
-  /**
-   * Data layer's backward propagation do nothing.
-   */
-  void backward(const UpdateCallback& callback) override { (void)callback; }
-
-  void copyOutputToOtherDevice() override {
-    for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-      copyDataToOutput(outputOtherDevice_[i]);
-    }
-  }
-
- private:
-  void copyDataToOutput(Argument& output);
-
- protected:
-  Argument data_;
-};
-
-typedef std::shared_ptr<DataLayer> DataLayerPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.cpp b/paddle/legacy/gserver/layers/DataNormLayer.cpp
deleted file mode 100644
index 6820dfa4d4d..00000000000
--- a/paddle/legacy/gserver/layers/DataNormLayer.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataNormLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(data_norm, DataNormLayer);
-
-bool DataNormLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weight */
-  CHECK(!biasParameter_) << "DataNormLayer does not need bias";
-  CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data")
-      << "DataNormLayer accepts one and only one DataLayer as its input layer";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_[0]->getSize(), getSize());
-  CHECK_EQ(parameters_[0]->getSize(), 5 * getSize());
-  CHECK(parameters_[0]->isStatic())
-      << "The parameter of DataNormLayer must be static";
-
-  weight_ = std::unique_ptr<Weight>(new Weight(5, getSize(), parameters_[0]));
-  min_ = Matrix::create(
-      nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_);
-  rangeReciprocal_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-  mean_ = Matrix::create(nullptr,
-                         /* height= */ 1,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  stdReciprocal_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-  decimalReciprocal_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize(),
-                                      /* trans= */ false,
-                                      useGpu_);
-
-  min_->setData(weight_->getW()->getData());
-  rangeReciprocal_->setData(weight_->getW()->getData() + getSize());
-  mean_->setData(weight_->getW()->getData() + 2 * getSize());
-  stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize());
-  decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize());
-
-  /* normalization strategy */
-  if (config_.data_norm_strategy() == "z-score") {
-    mode_ = kZScore;
-  } else if (config_.data_norm_strategy() == "min-max") {
-    mode_ = kMinMax;
-  } else if (config_.data_norm_strategy() == "decimal-scaling") {
-    mode_ = kDecimalScaling;
-  } else {
-    LOG(FATAL) << "Unknown data normalization strategy: "
-               << config_.data_norm_strategy();
-  }
-
-  return true;
-}
-
-void DataNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr inValue = getInputValue(0);
-  MatrixPtr outValue = getOutputValue();
-  outValue->copyFrom(*inValue);
-  switch (mode_) {
-    case kZScore: {
-      outValue->addBias(*mean_, -1.0);
-      outValue->colScale(0, *outValue, *stdReciprocal_);
-      break;
-    }
-    case kMinMax: {
-      outValue->addBias(*min_, -1.0);
-      outValue->colScale(0, *outValue, *rangeReciprocal_);
-      break;
-    }
-    case kDecimalScaling: {
-      outValue->colScale(0, *outValue, *decimalReciprocal_);
-      break;
-    }
-    default:
-      LOG(FATAL) << "should not reach here";
-  }
-}
-
-void DataNormLayer::backward(const UpdateCallback& callback) {
-  // The parameter for DataNormLayer is static, and does not need to be updated
-  (void)callback;
-
-  /* Calculate the input layers error */
-  const MatrixPtr& outGrad = getOutputGrad();
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad) {
-    switch (mode_) {
-      case kZScore: {
-        inGrad->addColScale(0, *outGrad, *stdReciprocal_);
-        break;
-      }
-      case kMinMax: {
-        inGrad->addColScale(0, *outGrad, *rangeReciprocal_);
-        break;
-      }
-      case kDecimalScaling: {
-        inGrad->addColScale(0, *outGrad, *decimalReciprocal_);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.h b/paddle/legacy/gserver/layers/DataNormLayer.h
deleted file mode 100644
index 7bb8e928248..00000000000
--- a/paddle/legacy/gserver/layers/DataNormLayer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for data normalization
- * - Input: One and only one input layer is accepted. The input layer must
- *        be DataLayer with dense data type.
- * - Output: The normalization of the input data
- *
- * Reference:
- *    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
- *
- * Three data normalization methoeds are considered
- * - z-score: y = (x-mean)/std
- * - min-max: y = (x-min)/(max-min)
- * - decimal-scaling: y = x/10^j, where j is the smallest integer such that
- *max(|y|)<1
- */
-
-class DataNormLayer : public Layer {
- public:
-  enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
-
-  explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DataNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  int mode_;
-  std::unique_ptr<Weight> weight_;
-  MatrixPtr min_;
-  MatrixPtr rangeReciprocal_;  // 1/(max-min)
-  MatrixPtr mean_;
-  MatrixPtr stdReciprocal_;      // 1/std
-  MatrixPtr decimalReciprocal_;  // 1/10^j
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
deleted file mode 100644
index 2cd635564c4..00000000000
--- a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DeConv3DLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(deconv3d, DeConv3DLayer);
-
-bool DeConv3DLayer::init(const LayerMap &layerMap,
-                         const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  // for Deconv, the dimension of Kernel is
-  // channel * output * depth * height * weigth
-  // Matrix storage format: (output * depth * height * weigth) x  channel
-  for (int index = 0; index < config_.inputs().size(); ++index) {
-    M_.push_back(filterChannels_[index]);
-    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * numFilters_;
-    width = filterChannels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t DeConv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  imgSizeW_.clear();
-  imgSizeH_.clear();
-  imgSizeD_.clear();
-  N_.clear();
-  NOut_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    imgSizeW_.push_back(
-        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    imgSizeH_.push_back(imageSize(
-        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    imgSizeD_.push_back(imageSize(
-        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += NOut_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(imgSizeH_[0]);
-  getOutput().setFrameWidth(imgSizeW_[0]);
-  getOutput().setFrameDepth(imgSizeD_[0]);
-  return layerSize;
-}
-
-void DeConv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  const MatrixPtr outMat = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    MatrixPtr wMat = weights_[i]->getW();
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    for (int n = 0; n < batchSize; ++n) {
-      real *inData = inMat->getData() + n * inMat->getStride();
-      for (int g = 0; g < groups_[i]; ++g) {
-        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
-        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
-        inData += M * N;
-      }
-      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
-                       numFilters_,
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i],
-                       1.0,
-                       1.0);
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void DeConv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-  int batchSize = getOutputGrad()->getHeight();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad() || this->needGradient_) {
-      int M = M_[i];
-      int N = N_[i];
-      int K = K_[i];
-      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-      const MatrixPtr &inMat = getInputValue(i);
-      for (int n = 0; n < batchSize; ++n) {
-        colBuf_->vol2Col(
-            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
-            numFilters_,
-            imgSizeD_[i],
-            imgSizeH_[i],
-            imgSizeW_[i],
-            filterSizeZ_[i],
-            filterSizeY_[i],
-            filterSize_[i],
-            strideZ_[i],
-            strideY_[i],
-            stride_[i],
-            paddingZ_[i],
-            paddingY_[i],
-            padding_[i]);
-        if (weights_[i]->getWGrad()) {
-          real *inData = inMat->getData() + n * inMat->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-            MatrixPtr wGradMatSub =
-                weights_[i]->getWGrad()->subMatrix(g * K, K);
-            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-            wGradMatSub->mul(
-                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
-            inData += M * N;
-          }
-        }
-        if (getInputGrad(i)) {
-          real *preGrad =
-              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
-            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
-            MatrixPtr inGradMatSub =
-                Matrix::create(preGrad, M, N, false, useGpu_);
-            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
-            preGrad += M * N;
-          }
-        }
-      }
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-void DeConv3DLayer::bpropWeights(int i) {}
-void DeConv3DLayer::bpropData(int i) {}
-
-void DeConv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  const MatrixPtr &outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void DeConv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.h b/paddle/legacy/gserver/layers/DeConv3DLayer.h
deleted file mode 100644
index 9931bccb128..00000000000
--- a/paddle/legacy/gserver/layers/DeConv3DLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of deconvolution3D layer.
- * This layer expands input and use matrix multiplication to
- * calculate deconvolution3D operation.
- */
-class DeConv3DLayer : public ConvBaseLayer {
- public:
-  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~DeConv3DLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
- protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  IntV NOut_;
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp b/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
deleted file mode 100644
index 93fe046c6a8..00000000000
--- a/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DetectionOutputLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(detection_output, DetectionOutputLayer);
-
-bool DetectionOutputLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto& layerConf = config_.inputs(0).detection_output_conf();
-  numClasses_ = layerConf.num_classes();
-  inputNum_ = layerConf.input_num();
-  nmsThreshold_ = layerConf.nms_threshold();
-  confidenceThreshold_ = layerConf.confidence_threshold();
-  nmsTopK_ = layerConf.nms_top_k();
-  keepTopK_ = layerConf.keep_top_k();
-  backgroundId_ = layerConf.background_id();
-  return true;
-}
-
-void DetectionOutputLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-
-  locSizeSum_ = 0;
-  confSizeSum_ = 0;
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    locSizeSum_ += inLoc->getElementCnt();
-    confSizeSum_ += inConf->getElementCnt();
-  }
-
-  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
-
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto& layerConf = config_.inputs(0).detection_output_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-    locOffset += appendWithPermute(*inLoc,
-                                   height,
-                                   width,
-                                   locSizeSum_,
-                                   locOffset,
-                                   batchSize,
-                                   *locTmpBuffer_,
-                                   kNCHWToNHWC);
-    confOffset += appendWithPermute(*inConf,
-                                    height,
-                                    width,
-                                    confSizeSum_,
-                                    confOffset,
-                                    batchSize,
-                                    *confTmpBuffer_,
-                                    kNCHWToNHWC);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-
-  MatrixPtr priorValue;
-  if (useGpu_) {
-    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
-    Matrix::resizeOrCreate(
-        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
-    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
-    Matrix::resizeOrCreate(
-        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
-
-    locCpuBuffer_->copyFrom(*locTmpBuffer_);
-    confCpuBuffer_->copyFrom(*confTmpBuffer_);
-    priorCpuValue_->copyFrom(*priorTmpValue);
-
-    locBuffer_ = locCpuBuffer_;
-    confBuffer_ = confCpuBuffer_;
-    priorValue = priorCpuValue_;
-  } else {
-    priorValue = getInputValue(*getPriorBoxLayer());
-    locBuffer_ = locTmpBuffer_;
-    confBuffer_ = confTmpBuffer_;
-  }
-  confBuffer_->softmax(*confBuffer_);
-
-  size_t numPriors = priorValue->getElementCnt() / 8;
-  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
-  for (size_t n = 0; n < batchSize; ++n) {
-    std::vector<NormalizedBBox> decodedBBoxes;
-    for (size_t i = 0; i < numPriors; ++i) {
-      size_t priorOffset = i * 8;
-      size_t locPredOffset = n * numPriors * 4 + i * 4;
-      std::vector<NormalizedBBox> priorBBoxVec;
-      getBBoxFromPriorData(
-          priorValue->getData() + priorOffset, 1, priorBBoxVec);
-      std::vector<std::vector<real>> priorBBoxVar;
-      getBBoxVarFromPriorData(
-          priorValue->getData() + priorOffset, 1, priorBBoxVar);
-      std::vector<real> locPredData;
-      for (size_t j = 0; j < 4; ++j)
-        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
-      NormalizedBBox bbox =
-          decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
-      decodedBBoxes.push_back(bbox);
-    }
-    allDecodedBBoxes.push_back(decodedBBoxes);
-  }
-
-  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
-  size_t numKept = getDetectionIndices(confBuffer_->getData(),
-                                       numPriors,
-                                       numClasses_,
-                                       backgroundId_,
-                                       batchSize,
-                                       confidenceThreshold_,
-                                       nmsTopK_,
-                                       nmsThreshold_,
-                                       keepTopK_,
-                                       allDecodedBBoxes,
-                                       &allIndices);
-
-  if (numKept > 0) {
-    resetOutput(numKept, 7);
-  } else {
-    MatrixPtr outV = getOutputValue();
-    if (outV) outV->resize(0, 0);
-    return;
-  }
-  MatrixPtr outV = getOutputValue();
-  getDetectionOutput(confBuffer_->getData(),
-                     numKept,
-                     numPriors,
-                     numClasses_,
-                     batchSize,
-                     allIndices,
-                     allDecodedBBoxes,
-                     *outV);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionOutputLayer.h b/paddle/legacy/gserver/layers/DetectionOutputLayer.h
deleted file mode 100644
index b0270ed3314..00000000000
--- a/paddle/legacy/gserver/layers/DetectionOutputLayer.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * The detection output layer for a SSD detection task. This layer applies the
- * Non-maximum suppression to the all predicted bounding box and keeps the
- * Top-K bounding boxes.
- * - Input: This layer needs three input layers: The first input layer
- *          is the priorbox layer. The rest two input layers are convolution
- *          layers for generating bbox location offset and the classification
- *          confidence.
- * - Output: The predict bounding box locations.
- */
-
-class DetectionOutputLayer : public Layer {
- public:
-  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr) {}
-
- protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[1 + index];
-  }
-
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[1 + inputNum_ + index];
-  }
-
- private:
-  size_t numClasses_;  // number of classes
-  size_t inputNum_;    // number of input layers
-  real nmsThreshold_;
-  real confidenceThreshold_;
-  size_t nmsTopK_;
-  size_t keepTopK_;
-  size_t backgroundId_;
-
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.cpp b/paddle/legacy/gserver/layers/DetectionUtil.cpp
deleted file mode 100644
index 0dc45e5a751..00000000000
--- a/paddle/legacy/gserver/layers/DetectionUtil.cpp
+++ /dev/null
@@ -1,576 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DetectionUtil.h"
-
-namespace paddle {
-
-size_t appendWithPermute(const Matrix& inMatrix,
-                         size_t height,
-                         size_t width,
-                         size_t outTotalSize,
-                         size_t outOffset,
-                         size_t batchSize,
-                         Matrix& outMatrix,
-                         PermMode permMode) {
-  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
-  bool useGpu = inMatrix.useGpu();
-  if (permMode == kNCHWToNHWC) {
-    size_t inElementCnt = inMatrix.getElementCnt();
-    size_t channels = inElementCnt / (height * width * batchSize);
-    size_t imgSize = height * width;
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t offset = i * (outTotalSize / batchSize) + outOffset;
-      const MatrixPtr inTmp = Matrix::create(
-          const_cast<real*>(inMatrix.getData()) + i * channels * imgSize,
-          channels,
-          imgSize,
-          false,
-          useGpu);
-      MatrixPtr outTmp =
-          Matrix::create(const_cast<real*>(outMatrix.getData()) + offset,
-                         imgSize,
-                         channels,
-                         false,
-                         useGpu);
-      inTmp->transpose(outTmp, false);
-    }
-    return channels * imgSize;
-  } else {
-    LOG(FATAL) << "Unkown permute mode";
-  }
-}
-
-size_t decomposeWithPermute(const Matrix& inMatrix,
-                            size_t height,
-                            size_t width,
-                            size_t inTotalSize,
-                            size_t inOffset,
-                            size_t batchSize,
-                            Matrix& outMatrix,
-                            PermMode permMode) {
-  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
-  bool useGpu = inMatrix.useGpu();
-  if (permMode == kNHWCToNCHW) {
-    size_t outElementCnt = outMatrix.getElementCnt();
-    size_t channels = outElementCnt / (height * width * batchSize);
-    size_t imgSize = height * width;
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t offset = i * (inTotalSize / batchSize) + inOffset;
-      const MatrixPtr inTmp =
-          Matrix::create(const_cast<real*>(inMatrix.getData()) + offset,
-                         imgSize,
-                         channels,
-                         false,
-                         useGpu);
-      MatrixPtr outTmp = Matrix::create(
-          const_cast<real*>(outMatrix.getData()) + i * channels * imgSize,
-          channels,
-          imgSize,
-          false,
-          useGpu);
-      inTmp->transpose(outTmp, false);
-    }
-    return channels * imgSize;
-  } else {
-    LOG(FATAL) << "Unkown permute mode";
-  }
-}
-
-real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
-  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
-      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
-    return 0.0;
-  } else {
-    real interXMin = std::max(bbox1.xMin, bbox2.xMin);
-    real interYMin = std::max(bbox1.yMin, bbox2.yMin);
-    real interXMax = std::min(bbox1.xMax, bbox2.xMax);
-    real interYMax = std::min(bbox1.yMax, bbox2.yMax);
-
-    real interWidth = interXMax - interXMin;
-    real interHeight = interYMax - interYMin;
-    real interArea = interWidth * interHeight;
-
-    real bboxArea1 = bbox1.getArea();
-    real bboxArea2 = bbox2.getArea();
-
-    return interArea / (bboxArea1 + bboxArea2 - interArea);
-  }
-}
-
-void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                       const vector<real>& priorBBoxVar,
-                       const NormalizedBBox& gtBBox,
-                       vector<real>& outVec) {
-  real priorBBoxWidth = priorBBox.getWidth();
-  real priorBBoxHeight = priorBBox.getHeight();
-  real priorBBoxCenterX = priorBBox.getCenterX();
-  real priorBBoxCenterY = priorBBox.getCenterY();
-
-  real gtBBoxWidth = gtBBox.getWidth();
-  real gtBBoxHeight = gtBBox.getHeight();
-  real gtBBoxCenterX = gtBBox.getCenterX();
-  real gtBBoxCenterY = gtBBox.getCenterY();
-
-  outVec.clear();
-  outVec.push_back((gtBBoxCenterX - priorBBoxCenterX) / priorBBoxWidth /
-                   priorBBoxVar[0]);
-  outVec.push_back((gtBBoxCenterY - priorBBoxCenterY) / priorBBoxHeight /
-                   priorBBoxVar[1]);
-  outVec.push_back(std::log(std::fabs(gtBBoxWidth / priorBBoxWidth)) /
-                   priorBBoxVar[2]);
-  outVec.push_back(std::log(std::fabs(gtBBoxHeight / priorBBoxHeight)) /
-                   priorBBoxVar[3]);
-}
-
-NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                                 const vector<real>& priorBBoxVar,
-                                 const vector<real>& locPredData) {
-  real priorBBoxWidth = priorBBox.getWidth();
-  real priorBBoxHeight = priorBBox.getHeight();
-  real priorBBoxCenterX = priorBBox.getCenterX();
-  real priorBBoxCenterY = priorBBox.getCenterY();
-
-  real decodedBBoxCenterX =
-      priorBBoxVar[0] * locPredData[0] * priorBBoxWidth + priorBBoxCenterX;
-  real decodedBBoxCenterY =
-      priorBBoxVar[1] * locPredData[1] * priorBBoxHeight + priorBBoxCenterY;
-  real decodedBBoxWidth =
-      std::exp(priorBBoxVar[2] * locPredData[2]) * priorBBoxWidth;
-  real decodedBBoxHeight =
-      std::exp(priorBBoxVar[3] * locPredData[3]) * priorBBoxHeight;
-
-  NormalizedBBox decodedBBox;
-  decodedBBox.xMin = decodedBBoxCenterX - decodedBBoxWidth / 2;
-  decodedBBox.yMin = decodedBBoxCenterY - decodedBBoxHeight / 2;
-  decodedBBox.xMax = decodedBBoxCenterX + decodedBBoxWidth / 2;
-  decodedBBox.yMax = decodedBBoxCenterY + decodedBBoxHeight / 2;
-
-  return decodedBBox;
-}
-
-void getBBoxFromPriorData(const real* priorData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  bboxVec.resize(bboxVec.size() + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    NormalizedBBox bbox;
-    bbox.xMin = *(priorData + i * 8);
-    bbox.yMin = *(priorData + i * 8 + 1);
-    bbox.xMax = *(priorData + i * 8 + 2);
-    bbox.yMax = *(priorData + i * 8 + 3);
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void getBBoxVarFromPriorData(const real* priorData,
-                             const size_t num,
-                             vector<vector<real>>& varVec) {
-  size_t outOffset = varVec.size();
-  varVec.resize(varVec.size() + num);
-  for (size_t i = 0; i < num; ++i) {
-    vector<real> var;
-    var.push_back(*(priorData + i * 8 + 4));
-    var.push_back(*(priorData + i * 8 + 5));
-    var.push_back(*(priorData + i * 8 + 6));
-    var.push_back(*(priorData + i * 8 + 7));
-    varVec[outOffset + i] = var;
-  }
-}
-
-void getBBoxFromLabelData(const real* labelData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  bboxVec.resize(bboxVec.size() + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    NormalizedBBox bbox;
-    bbox.xMin = *(labelData + i * 6 + 1);
-    bbox.yMin = *(labelData + i * 6 + 2);
-    bbox.xMax = *(labelData + i * 6 + 3);
-    bbox.yMax = *(labelData + i * 6 + 4);
-    real isDifficult = *(labelData + i * 6 + 5);
-    if (std::abs(isDifficult - 0.0) < 1e-6)
-      bbox.isDifficult = false;
-    else
-      bbox.isDifficult = true;
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void getBBoxFromDetectData(const real* detectData,
-                           const size_t numBBoxes,
-                           vector<real>& labelVec,
-                           vector<real>& scoreVec,
-                           vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  labelVec.resize(outOffset + numBBoxes);
-  scoreVec.resize(outOffset + numBBoxes);
-  bboxVec.resize(outOffset + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    labelVec[outOffset + i] = *(detectData + i * 7 + 1);
-    scoreVec[outOffset + i] = *(detectData + i * 7 + 2);
-    NormalizedBBox bbox;
-    bbox.xMin = *(detectData + i * 7 + 3);
-    bbox.yMin = *(detectData + i * 7 + 4);
-    bbox.xMax = *(detectData + i * 7 + 5);
-    bbox.yMax = *(detectData + i * 7 + 6);
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
-               const vector<NormalizedBBox>& gtBBoxes,
-               real overlapThreshold,
-               vector<int>* matchIndices,
-               vector<real>* matchOverlaps) {
-  map<size_t, map<size_t, real>> overlaps;
-  size_t numPriors = priorBBoxes.size();
-  size_t numGTs = gtBBoxes.size();
-
-  matchIndices->clear();
-  matchIndices->resize(numPriors, -1);
-  matchOverlaps->clear();
-  matchOverlaps->resize(numPriors, 0.0);
-
-  // Store the positive overlap between predictions and ground truth
-  for (size_t i = 0; i < numPriors; ++i) {
-    for (size_t j = 0; j < numGTs; ++j) {
-      real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]);
-      if (overlap > 1e-6) {
-        (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap);
-        overlaps[i][j] = overlap;
-      }
-    }
-  }
-  // Bipartite matching
-  vector<int> gtPool;
-  for (size_t i = 0; i < numGTs; ++i) {
-    gtPool.push_back(i);
-  }
-  while (gtPool.size() > 0) {
-    // Find the most overlapped gt and corresponding predictions
-    int maxPriorIdx = -1;
-    int maxGTIdx = -1;
-    real maxOverlap = -1.0;
-    for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
-         it != overlaps.end();
-         ++it) {
-      size_t i = it->first;
-      if ((*matchIndices)[i] != -1) {
-        // The prediction already has matched ground truth or is ignored
-        continue;
-      }
-      for (size_t p = 0; p < gtPool.size(); ++p) {
-        int j = gtPool[p];
-        if (it->second.find(j) == it->second.end()) {
-          // No overlap between the i-th prediction and j-th ground truth
-          continue;
-        }
-        // Find the maximum overlapped pair
-        if (it->second[j] > maxOverlap) {
-          maxPriorIdx = (int)i;
-          maxGTIdx = (int)j;
-          maxOverlap = it->second[j];
-        }
-      }
-    }
-    if (maxPriorIdx == -1) {
-      break;
-    } else {
-      (*matchIndices)[maxPriorIdx] = maxGTIdx;
-      (*matchOverlaps)[maxPriorIdx] = maxOverlap;
-      gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx));
-    }
-  }
-
-  // Get most overlaped for the rest prediction bboxes
-  for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
-       it != overlaps.end();
-       ++it) {
-    size_t i = it->first;
-    if ((*matchIndices)[i] != -1) {
-      // The prediction already has matched ground truth or is ignored
-      continue;
-    }
-    int maxGTIdx = -1;
-    real maxOverlap = -1;
-    for (size_t j = 0; j < numGTs; ++j) {
-      if (it->second.find(j) == it->second.end()) {
-        // No overlap between the i-th prediction and j-th ground truth
-        continue;
-      }
-      // Find the maximum overlapped pair
-      real overlap = it->second[j];
-      if (overlap > maxOverlap && overlap >= overlapThreshold) {
-        maxGTIdx = j;
-        maxOverlap = overlap;
-      }
-    }
-    if (maxGTIdx != -1) {
-      (*matchIndices)[i] = maxGTIdx;
-      (*matchOverlaps)[i] = maxOverlap;
-    }
-  }
-}
-
-pair<size_t, size_t> generateMatchIndices(
-    const Matrix& priorValue,
-    const size_t numPriorBBoxes,
-    const Matrix& gtValue,
-    const int* gtStartPosPtr,
-    const size_t seqNum,
-    const vector<vector<real>>& maxConfScore,
-    const size_t batchSize,
-    const real overlapThreshold,
-    const real negOverlapThreshold,
-    const size_t negPosRatio,
-    vector<vector<int>>* matchIndicesVecPtr,
-    vector<vector<int>>* negIndicesVecPtr) {
-  vector<NormalizedBBox> priorBBoxes;  // share same prior bboxes
-  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, priorBBoxes);
-  size_t totalPos = 0;
-  size_t totalNeg = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    vector<int> matchIndices;
-    vector<int> negIndices;
-    vector<real> matchOverlaps;
-    matchIndices.resize(numPriorBBoxes, -1);
-    matchOverlaps.resize(numPriorBBoxes, 0.0);
-    size_t numGTBBoxes = 0;
-    if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n];
-    if (!numGTBBoxes) {
-      matchIndicesVecPtr->push_back(matchIndices);
-      negIndicesVecPtr->push_back(negIndices);
-      continue;
-    }
-    vector<NormalizedBBox> gtBBoxes;
-    getBBoxFromLabelData(
-        gtValue.getData() + gtStartPosPtr[n] * 6, numGTBBoxes, gtBBoxes);
-
-    matchBBox(
-        priorBBoxes, gtBBoxes, overlapThreshold, &matchIndices, &matchOverlaps);
-
-    size_t numPos = 0;
-    size_t numNeg = 0;
-    for (size_t i = 0; i < matchIndices.size(); ++i)
-      if (matchIndices[i] != -1) ++numPos;
-    totalPos += numPos;
-    vector<pair<real, size_t>> scoresIndices;
-    for (size_t i = 0; i < matchIndices.size(); ++i)
-      if (matchIndices[i] == -1 && matchOverlaps[i] < negOverlapThreshold) {
-        scoresIndices.push_back(std::make_pair(maxConfScore[n][i], i));
-        ++numNeg;
-      }
-    numNeg = std::min(static_cast<size_t>(numPos * negPosRatio), numNeg);
-    std::sort(scoresIndices.begin(),
-              scoresIndices.end(),
-              sortScorePairDescend<size_t>);
-    for (size_t i = 0; i < numNeg; ++i)
-      negIndices.push_back(scoresIndices[i].second);
-    totalNeg += numNeg;
-    matchIndicesVecPtr->push_back(matchIndices);
-    negIndicesVecPtr->push_back(negIndices);
-  }
-  return std::make_pair(totalPos, totalNeg);
-}
-
-void getMaxConfidenceScores(const real* confData,
-                            const size_t batchSize,
-                            const size_t numPriorBBoxes,
-                            const size_t numClasses,
-                            const size_t backgroundId,
-                            vector<vector<real>>* maxConfScoreVecPtr) {
-  maxConfScoreVecPtr->clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    vector<real> maxConfScore;
-    for (size_t j = 0; j < numPriorBBoxes; ++j) {
-      int offset = j * numClasses;
-      real maxVal = -FLT_MAX;
-      real maxPosVal = -FLT_MAX;
-      real maxScore = 0.0;
-      for (size_t c = 0; c < numClasses; ++c) {
-        maxVal = std::max<real>(confData[offset + c], maxVal);
-        if (c != backgroundId)
-          maxPosVal = std::max<real>(confData[offset + c], maxPosVal);
-      }
-      real sum = 0.0;
-      for (size_t c = 0; c < numClasses; ++c)
-        sum += std::exp(confData[offset + c] - maxVal);
-      maxScore = std::exp(maxPosVal - maxVal) / sum;
-      maxConfScore.push_back(maxScore);
-    }
-    confData += numPriorBBoxes * numClasses;
-    maxConfScoreVecPtr->push_back(maxConfScore);
-  }
-}
-
-template <typename T>
-bool sortScorePairDescend(const pair<real, T>& pair1,
-                          const pair<real, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <>
-bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
-                          const pair<real, NormalizedBBox>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-void applyNMSFast(const vector<NormalizedBBox>& bboxes,
-                  const real* confScoreData,
-                  size_t classIdx,
-                  size_t topK,
-                  real confThreshold,
-                  real nmsThreshold,
-                  size_t numPriorBBoxes,
-                  size_t numClasses,
-                  vector<size_t>* indices) {
-  vector<pair<real, size_t>> scores;
-  for (size_t i = 0; i < numPriorBBoxes; ++i) {
-    size_t confOffset = i * numClasses + classIdx;
-    if (confScoreData[confOffset] > confThreshold)
-      scores.push_back(std::make_pair(confScoreData[confOffset], i));
-  }
-  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
-  if (topK > 0 && topK < scores.size()) scores.resize(topK);
-  while (scores.size() > 0) {
-    const size_t idx = scores.front().second;
-    bool keep = true;
-    for (size_t i = 0; i < indices->size(); ++i) {
-      if (keep) {
-        const size_t savedIdx = (*indices)[i];
-        real overlap = jaccardOverlap(bboxes[idx], bboxes[savedIdx]);
-        keep = overlap <= nmsThreshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) indices->push_back(idx);
-    scores.erase(scores.begin());
-  }
-}
-
-size_t getDetectionIndices(
-    const real* confData,
-    const size_t numPriorBBoxes,
-    const size_t numClasses,
-    const size_t backgroundId,
-    const size_t batchSize,
-    const real confThreshold,
-    const size_t nmsTopK,
-    const real nmsThreshold,
-    const size_t keepTopK,
-    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
-  size_t totalKeepNum = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
-    size_t numDetected = 0;
-    map<size_t, vector<size_t>> indices;
-    size_t confOffset = n * numPriorBBoxes * numClasses;
-    for (size_t c = 0; c < numClasses; ++c) {
-      if (c == backgroundId) continue;
-      applyNMSFast(decodedBBoxes,
-                   confData + confOffset,
-                   c,
-                   nmsTopK,
-                   confThreshold,
-                   nmsThreshold,
-                   numPriorBBoxes,
-                   numClasses,
-                   &(indices[c]));
-      numDetected += indices[c].size();
-    }
-    if (keepTopK > 0 && numDetected > keepTopK) {
-      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
-      for (size_t c = 0; c < numClasses; ++c) {
-        const vector<size_t>& labelIndices = indices[c];
-        for (size_t i = 0; i < labelIndices.size(); ++i) {
-          size_t idx = labelIndices[i];
-          scoreIndexPairs.push_back(
-              std::make_pair((confData + confOffset)[idx * numClasses + c],
-                             std::make_pair(c, idx)));
-        }
-      }
-      std::sort(scoreIndexPairs.begin(),
-                scoreIndexPairs.end(),
-                sortScorePairDescend<pair<size_t, size_t>>);
-      scoreIndexPairs.resize(keepTopK);
-      map<size_t, vector<size_t>> newIndices;
-      for (size_t i = 0; i < scoreIndexPairs.size(); ++i) {
-        size_t label = scoreIndexPairs[i].second.first;
-        size_t idx = scoreIndexPairs[i].second.second;
-        newIndices[label].push_back(idx);
-      }
-      allDetectionIndices->push_back(newIndices);
-      totalKeepNum += keepTopK;
-    } else {
-      allDetectionIndices->push_back(indices);
-      totalKeepNum += numDetected;
-    }
-  }
-  return totalKeepNum;
-}
-
-void getDetectionOutput(const real* confData,
-                        const size_t numKept,
-                        const size_t numPriorBBoxes,
-                        const size_t numClasses,
-                        const size_t batchSize,
-                        const vector<map<size_t, vector<size_t>>>& allIndices,
-                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-                        Matrix& out) {
-  MatrixPtr outBuffer;
-  Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
-  real* bufferData = outBuffer->getData();
-  size_t count = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
-         it != allIndices[n].end();
-         ++it) {
-      size_t label = it->first;
-      const vector<size_t>& indices = it->second;
-      const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
-      for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        size_t confOffset = n * numPriorBBoxes * numClasses + idx * numClasses;
-        bufferData[count * 7] = n;
-        bufferData[count * 7 + 1] = label;
-        bufferData[count * 7 + 2] = (confData + confOffset)[label];
-        NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]);
-        bufferData[count * 7 + 3] = clippedBBox.xMin;
-        bufferData[count * 7 + 4] = clippedBBox.yMin;
-        bufferData[count * 7 + 5] = clippedBBox.xMax;
-        bufferData[count * 7 + 6] = clippedBBox.yMax;
-        ++count;
-      }
-    }
-  }
-  out.copyFrom(bufferData, numKept * 7);
-}
-
-NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
-  real realOne = static_cast<real>(1.0);
-  real realZero = static_cast<real>(0.0);
-  NormalizedBBox clippedBBox;
-  clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero);
-  clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero);
-  clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero);
-  clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero);
-  return clippedBBox;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.h b/paddle/legacy/gserver/layers/DetectionUtil.h
deleted file mode 100644
index c1e0bb809ad..00000000000
--- a/paddle/legacy/gserver/layers/DetectionUtil.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <float.h>
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/Matrix.h"
-
-using std::vector;
-using std::pair;
-using std::map;
-
-namespace paddle {
-
-template <typename T>
-struct BBoxBase {
-  BBoxBase(T xMin, T yMin, T xMax, T yMax)
-      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
-
-  BBoxBase() {}
-
-  T getWidth() const { return xMax - xMin; }
-
-  T getHeight() const { return yMax - yMin; }
-
-  T getCenterX() const { return (xMin + xMax) / 2; }
-
-  T getCenterY() const { return (yMin + yMax) / 2; }
-
-  T getArea() const { return getWidth() * getHeight(); }
-
-  // coordinate of bounding box
-  T xMin;
-  T yMin;
-  T xMax;
-  T yMax;
-  // whether difficult object (e.g. object with heavy occlusion is difficult)
-  bool isDifficult;
-};
-
-struct NormalizedBBox : BBoxBase<real> {
-  NormalizedBBox() : BBoxBase<real>() {}
-};
-
-enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
-
-/**
- * @brief First permute input maxtrix then append to output matrix
- */
-size_t appendWithPermute(const Matrix& inMatrix,
-                         size_t height,
-                         size_t width,
-                         size_t outTotalSize,
-                         size_t outOffset,
-                         size_t batchSize,
-                         Matrix& outMatrix,
-                         PermMode permMode);
-
-/**
- * @brief First permute input maxtrix then decompose to output
- */
-size_t decomposeWithPermute(const Matrix& inMatrix,
-                            size_t height,
-                            size_t width,
-                            size_t totalSize,
-                            size_t offset,
-                            size_t batchSize,
-                            Matrix& outMatrix,
-                            PermMode permMode);
-
-/**
- * @brief Compute jaccard overlap between two bboxes.
- * @param bbox1 The first bbox
- * @param bbox2 The second bbox
- */
-real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
-
-/**
- * @brief Compute offset parameters between prior bbox and ground truth bbox
- * and variances of prior bbox are considered
- * @param priorBBox Input prior bbox
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param gtBBox Groundtruth bbox
- * @param outVec Output vector
- */
-void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                       const vector<real>& priorBBoxVar,
-                       const NormalizedBBox& gtBBox,
-                       vector<real>& outVec);
-
-/**
- * @brief Decode prior bbox with offset parameters
- * and variances of prior bbox are considered
- * @param priorBBox Prior bbox to be decoded
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param locPredData Offset parameters
- */
-NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                                 const vector<real>& priorBBoxVar,
-                                 const vector<real>& locPredData);
-
-/**
- * @brief Extract bboxes from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param numBBoxes Number of bbox to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromPriorData(const real* priorData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract labels, scores and bboxes from detection matrix, the layout is
- * imageId | label | score | xmin | ymin | xmax | ymax
- * @param detectData Matrix of detection value
- * @param numBBoxes Number of bbox to be extracted
- * @param labelVec Label of bbox
- * @param scoreVec Score of bbox
- * @param bboxVec Append to the vector
- */
-void getBBoxFromDetectData(const real* detectData,
-                           const size_t numBBoxes,
-                           vector<real>& labelVec,
-                           vector<real>& scoreVec,
-                           vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract variances from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param num Number to be extracted
- * @param varVec Append to the vector
- */
-void getBBoxVarFromPriorData(const real* priorData,
-                             const size_t num,
-                             vector<vector<real>>& varVec);
-
-/**
- * @brief Extract bboxes from label matrix, the layout is
- * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
- * @param labelData Matrix of label value
- * @param numBBoxes Number to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromLabelData(const real* labelData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
-* @brief Match prior bbox to groundtruth bbox, the strategy is:
-1. Find the most overlaped bbox pair (prior and groundtruth)
-2. For rest of prior bboxes find the most overlaped groundtruth bbox
-* @param priorBBoxes prior bbox
-* @param gtBBoxes groundtruth bbox
-* @param overlapThreshold Low boundary of overlap (judge whether matched)
-* @param matchIndices For each prior bbox, groundtruth bbox index if matched
-otherwise -1
-* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes
-*/
-void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
-               const vector<NormalizedBBox>& gtBBoxes,
-               real overlapThreshold,
-               vector<int>* matchIndices,
-               vector<real>* matchOverlaps);
-
-/**
-* @brief Generate positive bboxes and negative bboxes,
-|positive bboxes|/|negative bboxes| is negPosRatio
-* @param priorValue Prior value
-* @param numPriorBBoxes Number of prior bbox
-* @param gtValue Groundtruth value
-* @param gtStartPosPtr Since groundtruth value stored as sequence type,
-this parameter indicates start position of each record
-* @param seqNum Number of sequence
-* @param maxConfScore Classification score for prior bbox, used to mine
-negative examples
-* @param batchSize Image number
-* @param overlapThreshold Low boundary of overap
-* @param negOverlapThreshold Upper boundary of overap (judge negative example)
-* @param negPosRatio Control number of negative bboxes
-* @param matchIndicesVecPtr Save indices of matched prior bbox
-* @param negIndicesVecPtr Save indices of negative prior bbox
-*/
-pair<size_t, size_t> generateMatchIndices(
-    const Matrix& priorValue,
-    const size_t numPriorBBoxes,
-    const Matrix& gtValue,
-    const int* gtStartPosPtr,
-    const size_t seqNum,
-    const vector<vector<real>>& maxConfScore,
-    const size_t batchSize,
-    const real overlapThreshold,
-    const real negOverlapThreshold,
-    const size_t negPosRatio,
-    vector<vector<int>>* matchIndicesVecPtr,
-    vector<vector<int>>* negIndicesVecPtr);
-
-/**
- * @brief Get max confidence score for each prior bbox
- * @param confData Confidence scores, layout is
- * class1 score | class2 score | ... | classN score ...
- * @param batchSize Image number
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Classes number
- * @param backgroundId Background id
- * @param maxConfScoreVecPtr Ouput
- */
-void getMaxConfidenceScores(const real* confData,
-                            const size_t batchSize,
-                            const size_t numPriorBBoxes,
-                            const size_t numClasses,
-                            const size_t backgroundId,
-                            vector<vector<real>>* maxConfScoreVecPtr);
-
-template <typename T>
-bool sortScorePairDescend(const pair<real, T>& pair1,
-                          const pair<real, T>& pair2);
-
-template <>
-bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
-                          const pair<real, NormalizedBBox>& pair2);
-
-/**
- * @brief Do NMS for bboxes to remove duplicated bboxes
- * @param bboxes BBoxes to apply NMS
- * @param confScoreData Confidence scores
- * @param classIdx Class to do NMS
- * @param topK Number to keep
- * @param confThreshold Low boundary of confidence score
- * @param nmsThreshold Threshold of overlap
- * @param numPriorBBoxes Total number of prior bboxes
- * @param numClasses Total class number
- * @param indices Indices of high quality bboxes
- */
-void applyNMSFast(const vector<NormalizedBBox>& bboxes,
-                  const real* confScoreData,
-                  size_t classIdx,
-                  size_t topK,
-                  real confThreshold,
-                  real nmsThreshold,
-                  size_t numPriorBBoxes,
-                  size_t numClasses,
-                  vector<size_t>* indices);
-
-/**
- * @brief Get detection results which satify requirements
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param backgroundId Background class
- * @param batchSize Image number
- * @param confThreshold Threshold of class confidence
- * @param nmsTopK Used in NMS operation to keep top k bbox
- * @param nmsThreshold Used in NMS, threshold of overlap
- * @param keepTopK How many bboxes keeped in an image
- * @param allDecodedBBoxes Decoded bboxes for all images
- * @param allDetectionIndices Save detection bbox indices
- */
-size_t getDetectionIndices(
-    const real* confData,
-    const size_t numPriorBBoxes,
-    const size_t numClasses,
-    const size_t backgroundId,
-    const size_t batchSize,
-    const real confThreshold,
-    const size_t nmsTopK,
-    const real nmsThreshold,
-    const size_t keepTopK,
-    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
-
-/**
- * @brief Get detection results
- * @param confData Confidence scores
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param batchSize Image number
- * @param allIndices Indices of predicted bboxes
- * @param allDecodedBBoxes BBoxes decoded
- * @param out Output matrix
- * image number | label | confidence score | xMin | yMin | xMax | yMax
- */
-void getDetectionOutput(const real* confData,
-                        const size_t numKept,
-                        const size_t numPriorBBoxes,
-                        const size_t numClasses,
-                        const size_t batchSize,
-                        const vector<map<size_t, vector<size_t>>>& allIndices,
-                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-                        Matrix& out);
-
-NormalizedBBox clipBBox(const NormalizedBBox& bbox);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulOperator.cpp b/paddle/legacy/gserver/layers/DotMulOperator.cpp
deleted file mode 100644
index 03d18d9b239..00000000000
--- a/paddle/legacy/gserver/layers/DotMulOperator.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Operator.h"
-
-namespace paddle {
-
-/**
- * DotMulOperator takes two inputs, performs element-wise multiplication:
- * \f[
- *   out.row[i] += scale * (in1.row[i] .* in2.row[i])
- * \f]
- * where \f$.*\f$ means element-wise multiplication,
- * and scale is a config scalar, its default value is one.
- *
- * The config file api is dotmul_operator.
- */
-class DotMulOperator : public Operator {
- public:
-  DotMulOperator(const OperatorConfig& config, bool useGpu);
-  virtual void forward();
-  virtual void backward();
-};
-
-REGISTER_OPERATOR(dot_mul, DotMulOperator);
-
-DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK_EQ(config_.input_indices_size(), 2L);
-}
-
-void DotMulOperator::forward() {
-  out_->value->addDotMul(
-      *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale());
-}
-
-void DotMulOperator::backward() {
-  const MatrixPtr& inV0 = ins_[0]->value;
-  const MatrixPtr& inV1 = ins_[1]->value;
-  const MatrixPtr& inG0 = ins_[0]->grad;
-  const MatrixPtr& inG1 = ins_[1]->grad;
-
-  if (inG0) {
-    inG0->addDotMul(*out_->grad, *inV1, 1, config_.dotmul_scale());
-  }
-  if (inG1) {
-    inG1->addDotMul(*out_->grad, *inV0, 1, config_.dotmul_scale());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulProjection.cpp b/paddle/legacy/gserver/layers/DotMulProjection.cpp
deleted file mode 100644
index d7780387670..00000000000
--- a/paddle/legacy/gserver/layers/DotMulProjection.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * DotMulProjection performs element-wise multiplication with weight:
- * \f[
- *   out.row[i] += in.row[i] .* weight
- * \f]
- * where \f$.*\f$ means element-wise multiplication.
- *
- * The config file api is dotmul_projection.
- */
-class DotMulProjection : public Projection {
- public:
-  DotMulProjection(const ProjectionConfig& config,
-                   const ParameterPtr& parameter,
-                   bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  /// shared memory with parameter
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(dot_mul, DotMulProjection);
-
-DotMulProjection::DotMulProjection(const ProjectionConfig& config,
-                                   const ParameterPtr& parameter,
-                                   bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(new Weight(1LU, config.output_size(), parameter));
-}
-
-void DotMulProjection::forward() {
-  out_->value->addDotMulMMV(*in_->value, *(weight_->getW()));
-}
-
-void DotMulProjection::backward(const UpdateCallback& callback) {
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->addDotMulVMM(*out_->grad, *in_->value);
-  }
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    in_->grad->addDotMulMMV(*out_->grad, *(weight_->getW()));
-  }
-
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotProdLayer.cpp b/paddle/legacy/gserver/layers/DotProdLayer.cpp
deleted file mode 100644
index 06060d93f76..00000000000
--- a/paddle/legacy/gserver/layers/DotProdLayer.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the dot product of two vectors.
- * Input1: vector (batchSize * dim)
- * Input2: vector (batchSize * dim)
- * Output: a matrix: (batchSize * 1)
- */
-
-class DotProdLayer : public Layer {
- public:
-  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DotProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(dot_prod, DotProdLayer);
-
-bool DotProdLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-  CHECK_EQ(1UL, getSize())
-      << "The output dimensionality of this layer should be fixed to 1.";
-
-  return true;
-}
-
-void DotProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  CHECK_EQ(inV1->getHeight(), batchSize);
-  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, 1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
-    outV->sumOfProducts(*inV0, *inV1, 1, 0);
-  }
-}
-
-void DotProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
-
-    if (inG0) {
-      inG0->addRowScale(0, *inV1, *outG);
-    }
-
-    if (inG1) {
-      inG1->addRowScale(0, *inV0, *outG);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
deleted file mode 100644
index 38671126c62..00000000000
--- a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-/**
- * A layer for checking EOS for each sample:
- * - output_id = (input_id == conf.eos_id)
- *
- * The result is stored in output_.ids.
- * It is used by recurrent layer group.
- */
-class EosIdCheckLayer : public Layer {
- public:
-  explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    const Argument& input = getInput(0);
-    IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_);
-    output_.ids->isEqualTo(*input.ids, config_.eos_id());
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(eos_id, EosIdCheckLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
deleted file mode 100644
index 8a53db38068..00000000000
--- a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DEFINE_bool(use_nnpack,
-            false,
-            "Whether to use nnpack for convolution calculation.");
-
-namespace paddle {
-
-/*
- * The calculation of the exconvt(convolution transpose (deconv) operation)
- * is a swap of forward and backward of the calculation of exconv.
- * */
-REGISTER_LAYER(exconv, ExpandConvLayer);
-REGISTER_LAYER(exconvt, ExpandConvLayer);
-
-inline bool isDepthwiseConv(int channels, int groups) {
-  return channels == groups;
-}
-
-bool ExpandConvLayer::init(const LayerMap &layerMap,
-                           const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
-
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * filterChannels_[index];
-    width = (!isDeconv_) ? numFilters_ : channels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    index++;
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ = std::unique_ptr<Weight>(
-          new Weight(1, numFilters_, biasParameter_, 0));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_, 0));
-    }
-  }
-
-  getOutputSize();
-
-  size_t numInputs = config_.inputs_size();
-  inputShape_.resize(numInputs);
-  filterShape_.resize(numInputs);
-  outputShape_.resize(numInputs);
-
-  std::string convType;
-  std::string convGradInputType;
-  std::string convGradFilterType;
-
-  for (int i = 0; i < config_.inputs_size(); i++) {
-    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
-    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    std::vector<size_t> dilations = {(size_t)dilationY_[i],
-                                     (size_t)dilation_[i]};
-
-    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
-
-    // Convolution Layer uses the GemmConv function by default.
-    convType = "GemmConv";
-    convGradInputType = "GemmConvGradInput";
-    convGradFilterType = "GemmConvGradFilter";
-
-    // If depth wise convolution and useGpu == true
-    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-      convType = "DepthwiseConv";
-      convGradInputType = "DepthwiseConvGradInput";
-      convGradFilterType = "DepthwiseConvGradFilter";
-    }
-
-    // If depth wise convolution and useGpu == false and ARM-NEON
-    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-      if ((filterSize_[i] == filterSizeY_[i]) &&
-          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
-          !useDilation) {
-        convType = "NeonDepthwiseConv";
-      }
-#endif
-    }
-
-    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
-      createFunction(forward_,
-                     "NNPACKConv",
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("groups", (size_t)groups_[i])
-                         .set("algo", std::string("auto")));
-    } else {
-      createFunction(forward_,
-                     !isDeconv_ ? convType : convGradInputType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     !isDeconv_ ? convGradInputType : convType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     convGradFilterType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-    }
-  }
-  return true;
-}
-
-size_t ExpandConvLayer::getOutputSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  size_t layerSize = ConvBaseLayer::calOutputSize();
-  return layerSize;
-}
-
-// i is the index of input layers
-#define BACKWARD_INPUT(i, inputs, outputs) \
-  backward_[2 * i]->calc(inputs, outputs)
-#define BACKWARD_FILTER(i, inputs, outputs) \
-  backward_[2 * i + 1]->calc(inputs, outputs)
-
-void ExpandConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-
-  // Calculate the shape of the input, output, and filter.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    inputShape_[i] = TensorShape({(size_t)batchSize,
-                                  (size_t)channels_[i],
-                                  (size_t)imgSizeH_[i],
-                                  (size_t)imgSizeW_[i]});
-    filterShape_[i] =
-        TensorShape({(size_t)groups_[i],
-                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
-                                : (size_t)channels_[i] / groups_[i],
-                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
-                                : (size_t)numFilters_ / groups_[i],
-                     (size_t)filterSizeY_[i],
-                     (size_t)filterSize_[i]});
-    outputShape_[i] = TensorShape({(size_t)batchSize,
-                                   (size_t)numFilters_,
-                                   (size_t)outputH_[i],
-                                   (size_t)outputW_[i]});
-  }
-
-  // Calculate the output value.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getInputValue(i), inputShape_[i]);
-    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-    outputs.addArg(*getOutputValue(),
-                   outputShape_[i],
-                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
-
-    forward_[i]->calc(inputs, outputs);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get()) {
-    output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void ExpandConvLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  MatrixPtr outGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // Calculate the input grad and filter grad.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (getInputGrad(i)) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
-      BACKWARD_INPUT(i, inputs, outputs);
-    }
-
-    if (weights_[i]->getWGrad()) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      if (!isDeconv_) {
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-      } else {
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      }
-      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
-      BACKWARD_FILTER(i, inputs, outputs);
-
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.h b/paddle/legacy/gserver/layers/ExpandConvLayer.h
deleted file mode 100644
index c0eff3ab061..00000000000
--- a/paddle/legacy/gserver/layers/ExpandConvLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- *
- * The config file api is img_conv_layer.
- */
-
-class ExpandConvLayer : public ConvBaseLayer {
- public:
-  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-
-  ~ExpandConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  size_t getOutputSize();
-
- protected:
-  std::vector<TensorShape> inputShape_;
-  std::vector<TensorShape> filterShape_;
-  std::vector<TensorShape> outputShape_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.cpp b/paddle/legacy/gserver/layers/ExpandLayer.cpp
deleted file mode 100644
index 074fbab8ef9..00000000000
--- a/paddle/legacy/gserver/layers/ExpandLayer.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(expand, ExpandLayer);
-
-bool ExpandLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 2UL);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // which sequence type of input[0]
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  // Expand layer should have exactly 2 input, one for data, one for size
-  CHECK_EQ(2U, inputLayers_.size());
-
-  // using two input:
-  // * first one for data;
-  // * second one only for sequence info
-  const Argument& shapeInput = getInput(1);
-  const Argument& dataInput = getInput(0);
-  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-                              : shapeInput.sequenceStartPositions;
-  size_t numSequences = startPositions->getSize() - 1;
-  const int* starts = startPositions->getData(false);
-
-  CHECK_EQ(starts[numSequences], shapeInput.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input[1] must hasSubseq
-    CHECK_EQ(shapeInput.hasSubseq(), 1UL);
-    CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences());
-  } else {
-    CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences());
-  }
-
-  // set output sequence info as shape sequence
-  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
-  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-  }
-
-  // reserve output: Expand output to batchsize of sequence data.
-  reserveOutput(outputBatchSize, dataInput.value->getWidth());
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = expandStartsPos_->getMutableData(false);
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    for (int j = 0; j < sequenceLength; j++) {
-      expandStarts[starts[sequenceId] + j] = sequenceId;
-    }
-  }
-
-  outputValue->copyByRowIndex(*inputValue,
-                              *expandStartsPos_->getVector(useGpu_));
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-}
-
-void ExpandLayer::backward(const UpdateCallback& callback) {
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  if (!getInputGrad(0)) return;
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-                              : getInput(1).sequenceStartPositions;
-  size_t numSequences = cpuSeqStartPos->getSize() - 1;
-  const int* starts = cpuSeqStartPos->getData(false);
-
-  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
-  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);
-
-  AsyncGpuBlock asyncGpuBlock;
-
-  // sum to get the grad
-  real scale = 1;
-  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
-    // TODO(Dangqingqing) optimization for GPU
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    if (sequenceLength == 0) {
-      // empty sequence
-      continue;
-    }
-    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
-    copyData->collectBias(
-        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.h b/paddle/legacy/gserver/layers/ExpandLayer.h
deleted file mode 100644
index 75a1ec75688..00000000000
--- a/paddle/legacy/gserver/layers/ExpandLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "Expand Dense data or (sequence data where the length of each
- * sequence is one) to sequence data."
- *
- * It should have exactly 2 input, one for data, one for size:
- * - first one for data
- *   - If ExpandLevel = kNonSeq: dense data
- *   - If ExpandLevel = kSeq: sequence data where the length of each sequence is
- * one
- * - second one only for sequence info
- *   - should be sequence data with or without sub-sequence.
- *
- * And the output size is the batch size(not instances) of second input.
- *
- * The config file api is expand_layer.
- */
-
-class ExpandLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  /// if input[0] is dense data, ExpandLevel=kNonSeq;
-  /// if input[0] is sequence data, ExpandLevel=kSeq
-  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
-  /// store the ExpandLevel
-  int type_;
-  /// expanded sequenceStartPositions or subSequenceStartPositions
-  /// of input[1]
-  ICpuGpuVectorPtr expandStartsPos_;
-
- public:
-  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
deleted file mode 100644
index 6cf269fa3ff..00000000000
--- a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FactorizationMachineLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
-
-bool FactorizationMachineLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  factorSize_ = config_.factor_size();
-
-  /* initialize the latentVectors_ */
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t inputSize = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
-  latentVectors_ = std::unique_ptr<Weight>(
-      new Weight(inputSize, factorSize_, parameters_[0]));
-
-  return true;
-}
-
-void FactorizationMachineLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const MatrixPtr& inputV = getInputValue(0);
-
-  size_t batchSize = inputV->getHeight();
-  size_t outputSize = getSize();
-  size_t inputSize = inputLayers_[0]->getSize();
-  reserveOutput(batchSize, outputSize);
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(
-      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
-
-  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
-  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
-  inputMulFactor_->square2(*tmpOut_);
-  outV->sumRows(*tmpOut_, 0.5, 0);
-
-  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
-                                       inputV->getHeight(),
-                                       inputV->getWidth(),
-                                       inputV->getElementCnt(),
-                                       inputV->getValueType());
-    inputSquare_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
-  } else {
-    Matrix::resizeOrCreate(
-        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-    inputV->square2(*inputSquare_);
-  }
-  latentVectors_->getW()->square2(*latentVectorsSquare_);
-  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
-  outV->sumRows(*tmpOut_, -0.5, 1.0);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  const MatrixPtr& inputV = getInputValue(0);
-  const MatrixPtr& oGrad = getOutputGrad();
-
-  Matrix::resizeOrCreate(
-      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
-                                         latentVectors_->getW()->getHeight(),
-                                         1,
-                                         false,
-                                         useGpu_);
-
-  /* Calculate the gradients of the latentVectors_ matrix */
-  if (latentVectors_->getWGrad()) {
-    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
-                                         inputV->getHeight(),
-                                         inputV->getWidth(),
-                                         inputV->getElementCnt());
-
-      CpuSparseMatrix* sparseInputV =
-          dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* sparseInputSquare =
-          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
-      CpuSparseMatrix* sparseTmpInput =
-          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
-      sparseTmpInput->copyFrom(*sparseInputV);
-
-      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
-      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
-
-      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
-      negOnes_->zeroMem();
-      negOnes_->add(-1);
-      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
-    } else {
-      Matrix::resizeOrCreate(
-          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-
-      tmpInput_->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
-      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
-
-      tmpSum_->sumCols(*tmpInput_, -1, 0);
-    }
-
-    latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSumTrans);
-
-    /* Increasing the number of gradient */
-    latentVectors_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers gradient */
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad != NULL) {
-    inGrad->mul(
-        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
-    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum_);
-    inGrad->rowScale(0, *inGrad, *oGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
deleted file mode 100644
index fc015ed727b..00000000000
--- a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief The Factorization Machine models pairwise (order-2) feature
- * interactions as inner product of the learned latent vectors corresponding
- * to each input feature.
- *
- * The Factorization Machine can effectively capture feature interactions
- * especially when the input is sparse. While in principle FM can model higher
- * order feature interaction, in practice usually only order-2 feature
- * interactions are considered. The Factorization Machine Layer here only
- * computes the order-2 interations with the formula:
- *
- * \f[
- *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
- * \f]
- *
- * The detailed calculation for forward and backward can be found at this paper:
- *
- *     Factorization machines.
- *
- * The config file api is factorization_machine.
- */
-
-class FactorizationMachineLayer : public Layer {
- protected:
-  // The latent vectors, shape: (size, factorSize_)
-  // Each row of the latentVectors_ matrix is the latent vector
-  // corresponding to one input feature dimension
-  std::unique_ptr<Weight> latentVectors_;
-  // The hyperparameter that defines the dimensionality of the factorization
-  size_t factorSize_;
-
- private:
-  // Store the square values of the letent vectors matrix
-  MatrixPtr latentVectorsSquare_;
-  // Store the square values of input matrix
-  MatrixPtr inputSquare_;
-  // The result of input matrix * latent vector matrix that will be used in
-  // both forward and backward step
-  MatrixPtr inputMulFactor_;
-  // Store temporary calculation result
-  MatrixPtr tmpOut_;
-  MatrixPtr tmpSum_;
-  MatrixPtr tmpInput_;
-  // Negative identity matrix
-  MatrixPtr negOnes_;
-
- public:
-  explicit FactorizationMachineLayer(const LayerConfig& config)
-      : Layer(config) {}
-  ~FactorizationMachineLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
deleted file mode 100644
index a3fe1433e4b..00000000000
--- a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for expanding a batch of images to feature maps.
- * Each data of the input is a 2 dimensional matrix. Each element of the matrix
- * is replicated num_filters times to create a feature map with num_filters
- * channels.
- * - Input: Input one should be dense image data.
- * - Output: expanded fature maps.
- * \f[
- *  y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1)
- * \f]
- * For example, num_filters = 4:
- * @code
- *   x = [a1,a2;
- *        b1,b2]
- *   y = [a1, a2, a1, a2, a1, a2, a1, a2;
- *        b1, b2, b1, b2, b1, b2, b1, b2;]
- * @endcode
- */
-
-class FeatureMapExpandLayer : public Layer {
- private:
-  int numFilters_;
-  bool asRowVector_;
-
- public:
-  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~FeatureMapExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
-
-bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  numFilters_ = config_.num_filters();
-  asRowVector_ = config_.user_arg() != "as_col_vec";
-  return true;
-}
-
-void FeatureMapExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inputV = getInputValue(0);
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inputV->getWidth();
-  resetOutput(batchSize, imgSize * numFilters_);
-
-  MatrixPtr outputV = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        outVTmp->addRowVector(*inVTmp);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        outVTmp->addColVector(*inVTmp);
-      }
-    }
-  }
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inGrad = getInputGrad(0);
-  if (NULL == inGrad) {
-    return;
-  }
-  MatrixPtr outGrad = getOutputGrad();
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inGrad->getWidth();
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        inGradTmp->collectBias(*outGradTmp, 1);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        inGradTmp->sumRows(*outGradTmp, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle.
diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.cpp b/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
deleted file mode 100644
index b9f1bc99fab..00000000000
--- a/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FullMatrixProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(fc, FullMatrixProjection);
-
-FullMatrixProjection::FullMatrixProjection(const ProjectionConfig& config,
-                                           const ParameterPtr& parameter,
-                                           bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(
-      new Weight(config.input_size(), config.output_size(), parameter));
-}
-
-void FullMatrixProjection::forward() {
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(*(in_->value), *(weight_->getW()), 1, 1);
-}
-
-void FullMatrixProjection::backward(const UpdateCallback& callback) {
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(
-        *(in_->value->getTranspose()), *(out_->grad), 1, 1);
-  }
-
-  // If callback does not change value, backward propagation error
-  // asynchronously, so that we can do the callback concurrently.
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(*(out_->grad), *(weight_->getW()->getTranspose()), 1, 1);
-  }
-
-  hl_set_sync_flag(syncFlag);
-  if (weight_->getWGrad()) {
-    parameter_->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.h b/paddle/legacy/gserver/layers/FullMatrixProjection.h
deleted file mode 100644
index c33d02a3aea..00000000000
--- a/paddle/legacy/gserver/layers/FullMatrixProjection.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/utils/Stat.h"
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * FullMatrixProjection performs full matrix multiplication:
- * \f[
- *    out.row[i] += in.row[i] * weight
- * \f]
- *
- * The config file api is full_matrix_projection.
- */
-class FullMatrixProjection : public Projection {
- public:
-  FullMatrixProjection(const ProjectionConfig& config,
-                       const ParameterPtr& parameter,
-                       bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
deleted file mode 100644
index 07f4dfbe39c..00000000000
--- a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(fc, FullyConnectedLayer);
-
-bool FullyConnectedLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    // Option the parameters
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-
-    // create a new weight
-    if (parameters_[i]->isSparse()) {
-      CHECK_LE(parameters_[i]->getSize(), width * height);
-    } else {
-      CHECK_EQ(parameters_[i]->getSize(), width * height);
-    }
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void FullyConnectedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto* sparseParam =
-        dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-    if (sparseParam) {
-      MatrixPtr input = getInputValue(i);
-      sparseParam->addRows(input);
-    }
-  }
-}
-
-void FullyConnectedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto input = getInput(i);
-    CHECK(input.value) << "The input of 'fc' layer must be matrix";
-    REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
-           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FullyConnectedLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the W-gradient for the current layer */
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr input_T = getInputValue(i)->getTranspose();
-      MatrixPtr oGrad = getOutputGrad();
-      {
-        REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
-      }
-    }
-
-    // If callback does not change value, backprop error asynchronously so that
-    // we can do the callback concurrently.
-    hl_set_sync_flag(false);
-
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
-    }
-
-    hl_set_sync_flag(syncFlag);
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.h b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
deleted file mode 100644
index 7e29cac0437..00000000000
--- a/paddle/legacy/gserver/layers/FullyConnectedLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * A layer has full connections to all neurons in the previous layer.
- * It computes an inner product with a set of learned weights, and
- * (optionally) adds biases.
- *
- * The config file api is fc_layer.
- */
-
-class FullyConnectedLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
-  ~FullyConnectedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
deleted file mode 100644
index bdcd445cb47..00000000000
--- a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GatedRecurrentLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer);
-
-bool GatedRecurrentLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-  gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0));
-  stateWeight_.reset(new Weight(
-      getSize(), getSize(), parameters_[0], 2 * getSize() * getSize()));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  reversed_ = config_.reversed();
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-
-  GruCompute::init(config_);
-  useBatch_ = true;
-
-  return true;
-}
-
-void GatedRecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed gated "
-                       "recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-
-  // TODO(hedaoyuan): support prev_batch_state
-  CHECK(!FLAGS_prev_batch_state) << "Not supported";
-
-  useBatch_ = false;
-}
-
-void GatedRecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for GatedRecurrentLayer state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr GatedRecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void GatedRecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  // batchSize = length of total frames in a batch (NOT size of mini-batch)
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    forwardBatch(batchSize, numSequences, starts, input.value);
-  } else {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  }
-}
-
-void GatedRecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    backwardBatch(batchSize, input.grad);
-  } else {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void GatedRecurrentLayer::forwardSequence(int batchSize,
-                                          size_t numSequences,
-                                          const int* starts,
-                                          MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = nullptr;
-
-  if (reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&gruValue](bool reversed, int frameSize) {
-    gruValue.prevOutValue = gruValue.outputValue;
-    if (!reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-    }
-  };
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      gruValue.prevOutValue = prevOutput_->getData();
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        GruCompute::forward<1>(gruValue, getSize());
-      } else {
-        GruCompute::forward<0>(gruValue, getSize());
-      }
-
-      nextFrame(reversed_, getSize());
-    }
-    if (!reversed_) {
-      if (!prevOutput_) gruValue.prevOutValue = nullptr;
-    } else {
-      gruValue.prevOutValue = nullptr;
-    }
-  }
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void GatedRecurrentLayer::backwardSequence(int batchSize,
-                                           size_t numSequences,
-                                           const int* starts,
-                                           MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str());
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-    gruGrad.gateGrad += (batchSize - 1) * getSize() * 3;
-    gruGrad.resetOutputGrad += (batchSize - 1) * getSize();
-    gruGrad.outputGrad += (batchSize - 1) * getSize();
-    gruValue.prevOutValue = gruValue.outputValue - getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad - getSize();
-  } else {
-    gruValue.prevOutValue = gruValue.outputValue + getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad + getSize();
-  }
-
-  auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-      gruGrad.gateGrad += frameSize * 3;
-      gruGrad.resetOutputGrad += frameSize;
-      gruGrad.outputGrad += frameSize;
-      gruValue.prevOutValue = gruValue.outputValue + frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-      gruGrad.gateGrad -= frameSize * 3;
-      gruGrad.resetOutputGrad -= frameSize;
-      gruGrad.outputGrad -= frameSize;
-      gruValue.prevOutValue = gruValue.outputValue - frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize;
-    }
-  };
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          gruValue.prevOutValue = nullptr;
-          gruGrad.prevOutGrad = nullptr;
-        }
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize());
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize());
-        }
-        nextFrame(reversed_, getSize());
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-}
-
-void GatedRecurrentLayer::forwardBatch(int batchSize,
-                                       size_t numSequences,
-                                       const int* starts,
-                                       MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int curBatchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
-      gruValue.outputValue = outputValueTmp->getData();
-      gruValue.gateValue =
-          (batchValue_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      curBatchSize = outputValueTmp->getHeight();
-      gruValue.prevOutValue =
-          (n == 0
-               ? nullptr
-               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
-
-      {
-        if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
-        } else {
-          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
-        }
-      }
-    }
-  }
-  { batchValue_->copyBackSeq(*output_.value); }
-}
-
-void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  { batchGrad_->copyFromSeq(*output_.grad); }
-
-  {
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      gruValue.gateValue =
-          (batchGrad_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
-      gruGrad.outputGrad = outputGradTmp->getData();
-      gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
-      gruGrad.resetOutputGrad =
-          (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();
-
-      {
-        batchSize = outputGradTmp->getHeight();
-        gruValue.prevOutValue =
-            (n == 0
-                 ? nullptr
-                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
-        gruGrad.prevOutGrad =
-            (n == 0 ? nullptr
-                    : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
-
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
deleted file mode 100644
index 8bbf01ce200..00000000000
--- a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Please refer to "Junyoung Chung, Empirical Evaluation
- * of Gated Recurrent Neural Networks on Sequence Modeling".
- *
- * GatedRecurrentLayer takes 1 input layer with size * 3.
- * Input layer is diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * parameter and biasParameter is also diveded into 3 equal parts:
- *   - parameter consists of (U_z, U_r, U)
- *   - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\
- * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\
- * \f]
- *
- * @note
- * - dot denotes "element-wise multiplication".
- * - actNode is defined by config active_type
- * - actGate is defined by config actvie_gate_type
- *
- * The config file is grumemory.
- */
-
-class GatedRecurrentLayer : public Layer, public GruCompute {
- public:
-  explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int* starts,
-                       MatrixPtr inputValue);
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int* starts,
-                        MatrixPtr inputGrad);
-
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts,
-                    MatrixPtr inputValue);
-  void backwardBatch(int batchSize, MatrixPtr inputGrad);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> gateWeight_;
-  std::unique_ptr<Weight> stateWeight_;
-  std::unique_ptr<Weight> bias_;
-
-  Argument gate_;
-  Argument resetOutput_;
-
-  bool reversed_;
-  bool useBatch_;
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-  std::unique_ptr<ActivationFunction> activationGate_;
-
-  MatrixPtr prevOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GetOutputLayer.cpp b/paddle/legacy/gserver/layers/GetOutputLayer.cpp
deleted file mode 100644
index 7c1e3c407cc..00000000000
--- a/paddle/legacy/gserver/layers/GetOutputLayer.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class GetOutputLayer : public Layer {
- public:
-  explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GetOutputLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-    CHECK_EQ(1U, inputLayers_.size());
-    CHECK_NE(inputArgument_[0], "");
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    output_ = getPrev(0)->getOutput(inputArgument_[0]);
-  }
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-REGISTER_LAYER(get_output, GetOutputLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.cpp b/paddle/legacy/gserver/layers/GruCompute.cpp
deleted file mode 100644
index adad6285b7d..00000000000
--- a/paddle/legacy/gserver/layers/GruCompute.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/legacy/function/GruFunctor.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void GruCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-}
-
-template <>
-void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
-  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
-                                             hppl::forward::gru_finalOutput(),
-                                             value,
-                                             frameSize,
-                                             batchSize,
-                                             activeNode_,
-                                             activeGate_);
-}
-
-template <>
-void GruCompute::backward<0>(hl_gru_value value,
-                             hl_gru_grad grad,
-                             int frameSize,
-                             int batchSize) {
-  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
-      hppl::backward::gru_stateGrad(),
-      hppl::backward::gru_resetGrad(),
-      value,
-      grad,
-      frameSize,
-      batchSize,
-      activeNode_,
-      activeGate_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.cu b/paddle/legacy/gserver/layers/GruCompute.cu
deleted file mode 100644
index 54be6b80475..00000000000
--- a/paddle/legacy/gserver/layers/GruCompute.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-
-#include "hl_recurrent_apply.cuh"
-
-namespace paddle {
-
-template <>
-void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
-  hl_gpu_gru_forward(hppl::forward::gru_resetOutput(),
-                     hppl::forward::gru_finalOutput(),
-                     value,
-                     frameSize,
-                     batchSize,
-                     activeNode_,
-                     activeGate_);
-}
-
-template <>
-void GruCompute::backward<1>(hl_gru_value value,
-                             hl_gru_grad grad,
-                             int frameSize,
-                             int batchSize) {
-  hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
-                      hppl::backward::gru_resetGrad(),
-                      value,
-                      grad,
-                      frameSize,
-                      batchSize,
-                      activeNode_,
-                      activeGate_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.h b/paddle/legacy/gserver/layers/GruCompute.h
deleted file mode 100644
index 6feea7aca81..00000000000
--- a/paddle/legacy/gserver/layers/GruCompute.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-class GruCompute {
- public:
-  void init(LayerConfig &config);
-
-  template <bool useGpu>
-  void forward(hl_gru_value value, int frameSize, int batchSize = 1);
-
-  template <bool useGpu>
-  void backward(hl_gru_value value,
-                hl_gru_grad grad,
-                int frameSize,
-                int batchSize = 1);
-
- public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruStepLayer.cpp b/paddle/legacy/gserver/layers/GruStepLayer.cpp
deleted file mode 100644
index 2480e42d68b..00000000000
--- a/paddle/legacy/gserver/layers/GruStepLayer.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent
- * layer group. GruStepLayer takes 2 input layer.
- * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * - input[1] with size: {prev_out}.
- *
- * parameter and biasParameter is also diveded into 3 equal parts:
- * - parameter consists of (U_z, U_r, U)
- * - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r)  \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o)
- * \\
- * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out)
- * \f]
- *
- * @note
- *   - dot denotes "element-wise multiplication".
- *   - actNode is defined by config active_type
- *   - actGate is defined by config actvie_gate_type
- *
- * The config file api if gru_step_layer.
- */
-class GruStepLayer : public Layer, public GruCompute {
- protected:
-  Argument gate_;
-  Argument resetOutput_;
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
- public:
-  explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GruStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(gru_step, GruStepLayer);
-
-bool GruStepLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  GruCompute::init(config_);
-  return true;
-}
-
-void GruStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  CHECK_EQ(getSize(), prevOutput.value->getWidth());
-
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 3,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(resetOutput_,
-                     batchSize,
-                     getSize(),
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  if (useGpu_) {
-    GruCompute::forward<1>(gruValue, getSize(), batchSize);
-  } else {
-    GruCompute::forward<0>(gruValue, getSize(), batchSize);
-  }
-}
-
-void GruStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str());
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (weight_->getWGrad()
-           ? weight_->getWGrad()->getData() + getSize() * getSize() * 2
-           : nullptr);
-
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-  if (prevOutput.grad) {
-    gruGrad.prevOutGrad = prevOutput.grad->getData();
-  } else {
-    gruGrad.prevOutGrad = nullptr;
-  }
-
-  if (useGpu_) {
-    GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-  } else {
-    GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
deleted file mode 100644
index 34495994096..00000000000
--- a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "HierarchicalSigmoidLayer.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer);
-
-bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap,
-                                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK(config_.has_num_classes()) << "num_classes must be specifed in config";
-  numClasses_ = config_.num_classes();
-  CHECK_GE(numClasses_, (size_t)2);
-  codeLength_ = findLastSet(numClasses_ - 1);
-
-  size_t height = numClasses_ - 1;
-
-  /* initialize the weightList */
-  // The last input layer is for label
-  CHECK(!parameters_.back());
-  for (size_t i = 0; i < inputLayers_.size() - 1; i++) {
-    size_t width = inputLayers_[i]->getSize();
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1);
-    biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_));
-  }
-
-  return true;
-}
-
-void HierarchicalSigmoidLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  Matrix::resizeOrCreate(preOutput_.value,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  preOutput_.value->zeroMem();
-
-  if (useGpu_) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           output_.value->getHeight(),
-                           output_.value->getWidth(),
-                           /* trans */ false,
-                           false);
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-    cpuOutput_->copyFrom(*output_.value);
-  } else {
-    cpuOutput_ = output_.value;
-    cpuLabel_ = label;
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_->getW());
-    } else {
-      cpuBias_ = biases_->getW();
-    }
-    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
-  }
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    MatrixPtr input = getInputValue(i);
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuInput_,
-                             input->getHeight(),
-                             input->getWidth(),
-                             /* trans */ false,
-                             false);
-      Matrix::resizeOrCreate(cpuWeight_,
-                             weights_[i]->getW()->getHeight(),
-                             weights_[i]->getW()->getWidth(),
-                             /* trans */ false,
-                             false);
-      cpuInput_->copyFrom(*input);
-      cpuWeight_->copyFrom(*weights_[i]->getW());
-    } else {
-      cpuInput_ = input;
-      cpuWeight_ = weights_[i]->getW();
-    }
-    preOutput_.value->mulByBitCode(
-        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
-  }
-  // keep consistent with the clipping in the following softrelu
-  preOutput_.value->clip(-40.0, 40.0);
-  preOutput_.value->sumByBitCode(numClasses_,
-                                 *cpuLabel_,
-                                 *cpuOutput_,
-                                 -1);  // scaleSum
-  preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
-  preOutput_.value->rowSum(*sum);
-  cpuOutput_->add(*sum);
-  if (useGpu_) {
-    output_.value->copyFrom(*cpuOutput_);
-  } else {
-    output_.value = cpuOutput_;
-  }
-}
-
-void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-  } else {
-    cpuLabel_ = label;
-  }
-  preOutput_.grad->one();
-  preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
-
-  if (biases_ && biases_->getWGrad()) {
-    MatrixPtr biases_grad = biases_->getWGrad();
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_grad);
-    } else {
-      cpuBias_ = biases_grad;
-    }
-    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
-    if (useGpu_) {
-      biases_grad->copyFrom(*cpuBias_);
-    } else {
-      biases_grad = cpuBias_;
-    }
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    /* Calculate the W-gradient for the current layer */
-    MatrixPtr input = getInputValue(i);
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr weights_grad = weights_[i]->getWGrad();
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInput_,
-                               input->getHeight(),
-                               input->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeightGrad_,
-                               weights_grad->getHeight(),
-                               weights_grad->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInput_->copyFrom(*input);
-        cpuWeightGrad_->copyFrom(*weights_grad);
-      } else {
-        cpuInput_ = input;
-        cpuWeightGrad_ = weights_grad;
-      }
-      preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
-      if (useGpu_) {
-        weights_grad->copyFrom(*cpuWeightGrad_);
-      } else {
-        weights_grad = cpuWeightGrad_;
-      }
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-
-    /* Calculate the input layers error */
-    MatrixPtr inputGrad = getInputGrad(i);
-    if (inputGrad) {
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInputGrad_,
-                               inputGrad->getHeight(),
-                               inputGrad->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeight_,
-                               weights_[i]->getW()->getHeight(),
-                               weights_[i]->getW()->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInputGrad_->copyFrom(*inputGrad);
-        cpuWeight_->copyFrom(*weights_[i]->getW());
-      } else {
-        cpuInputGrad_ = inputGrad;
-        cpuWeight_ = weights_[i]->getW();
-      }
-      preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
-      if (useGpu_) {
-        inputGrad->copyFrom(*cpuInputGrad_);
-      } else {
-        inputGrad = cpuInputGrad_;
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
deleted file mode 100644
index 73ef252fd5a..00000000000
--- a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Organize the classes into a binary tree. At each node, a sigmoid function
- * is used to calculate the probability of belonging to the right branch.
- * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
- * Hierarchical Probabilistic Neural Network Language Model."
- *
- * Here we uses a simple way of making the binary tree.
- * Assuming the number of classes C = 6,
- * The classes are organized as a binary tree in the following way:
- *
- * @code{.py}
- * *-*-*- 2
- * | | |- 3
- * | |
- * | |-*- 4
- * |   |- 5
- * |
- * |-*- 0
- *   |- 1
- * @endcode
- *
- * where * indicates an internal node, and each leaf node represents a class.
- * - Node 0 ... C-2 are internal nodes.
- * - Node C-1 ... 2C-2 are leaf nodes.
- * - Class c is represented by leaf node \f$c+C-1\f$.
- *
- * We assign an id for each node:
- * - the id of root be 0.
- * - the left child of a node i is 2*i+1.
- * - the right child of a node i is 2*i+2.
- *
- * It's easy to see that:
- * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
- * - the j-th level ancestor of node i is
- * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
- * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
- *
- * The config file api is hsigmod_layer.
- */
-class HierarchicalSigmoidLayer : public Layer {
- public:
-  explicit HierarchicalSigmoidLayer(const LayerConfig& config)
-      : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  /**
-   * The last of inputs is label layer.
-   */
-  LayerPtr getLabelLayer() { return inputLayers_.back(); }
-
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  /// number of classes
-  size_t numClasses_;
-  /// codeLength_ = \f$1 + \left\lfloor log_{2}(numClasses-1)\right\rfloor\f$
-  int codeLength_;
-  /// temporary result of output_
-  Argument preOutput_;
-
-  /// The temporary variables in CPU memory.
-  MatrixPtr cpuWeight_;
-  MatrixPtr cpuWeightGrad_;
-  MatrixPtr cpuInput_;
-  MatrixPtr cpuInputGrad_;
-  MatrixPtr cpuBias_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/IdentityProjection.cpp b/paddle/legacy/gserver/layers/IdentityProjection.cpp
deleted file mode 100644
index f707642e09b..00000000000
--- a/paddle/legacy/gserver/layers/IdentityProjection.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * IdentityProjection performs addition:
- * \f[
- *   out.row[i] += in.row[i]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityProjection : public Projection {
- public:
-  IdentityProjection(const ProjectionConfig& config,
-                     const ParameterPtr& parameter,
-                     bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity, IdentityProjection);
-
-/**
- * Constructed function.
- * @note IdentityProjection should not have any parameter.
- */
-IdentityProjection::IdentityProjection(const ProjectionConfig& config,
-                                       const ParameterPtr& parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity' projection should not have any parameter";
-}
-
-void IdentityProjection::forward() { out_->value->add(*in_->value); }
-
-void IdentityProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->add(*out_->grad);
-  }
-}
-
-/**
- * IdentityOffsetProjection likes IdentityProjection, but layer size may be
- * smaller
- * than input size. It selects dimensions [offset, offset+layer_size) from input
- * to
- * perform addition:
- * \f[
- *   out.row[i] += in.row[i + \textrm{offset}]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityOffsetProjection : public Projection {
- public:
-  IdentityOffsetProjection(const ProjectionConfig& config,
-                           const ParameterPtr& parameter,
-                           bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection);
-
-/**
- * Constructed function.
- * @note IdentityOffsetProjection should not have any parameter.
- */
-IdentityOffsetProjection::IdentityOffsetProjection(
-    const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity_offset' projection "
-                       "should not have any parameter";
-  CHECK_LE(config.output_size() + config.offset(), config.input_size());
-}
-
-void IdentityOffsetProjection::forward() {
-  out_->value->addAtOffset(*in_->value, config_.offset());
-}
-
-void IdentityOffsetProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->addAtOffset(*out_->grad, config_.offset());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/InterpolationLayer.cpp b/paddle/legacy/gserver/layers/InterpolationLayer.cpp
deleted file mode 100644
index ed2294e8a39..00000000000
--- a/paddle/legacy/gserver/layers/InterpolationLayer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for linear interpolation with two inputs,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
- * \f]
- * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs,
- * \f$w\f$ is (batchSize x 1) weight vector,
- * and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is interpolation_layer.
- */
-
-class InterpolationLayer : public Layer {
- protected:
-  /// weightLast = 1 - weight
-  MatrixPtr weightLast_;
-  MatrixPtr tmpMatrix;
-
- public:
-  explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~InterpolationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(interpolation, InterpolationLayer);
-
-bool InterpolationLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(3U, inputLayers_.size());
-
-  return true;
-}
-
-void InterpolationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(dataDim, inV2->getWidth());
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(batchSize, inV2->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_);
-  weightLast_->one();
-  weightLast_->sub(*weightV);
-
-  REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str());
-  // outV = inV1 * weight + inV2 * weightLast
-  outV->addRowScale(0, *inV1, *weightV);
-  outV->addRowScale(0, *inV2, *weightLast_);
-}
-
-void InterpolationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr inG2 = getInputGrad(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str());
-
-  if (inG0) {
-    Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_);
-
-    // inG0 += outG .* (inV1 - inV2)
-    tmpMatrix->sub(*inV1, *inV2);
-    inG0->rowDotMul(0, *outG, *tmpMatrix);
-  }
-
-  if (inG1) {
-    // inG1 += outG * weight
-    inG1->addRowScale(0, *outG, *weightV);
-  }
-
-  if (inG2) {
-    // inG2 += outG * weightLast
-    inG2->addRowScale(0, *outG, *weightLast_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
deleted file mode 100644
index 7fd25954efe..00000000000
--- a/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class KmaxSeqScoreLayer : public Layer {
- private:
-  MatrixPtr scores_;
-  size_t beamSize_;
-  void kmaxScorePerSeq(const real* score,
-                       real* sortedRes,
-                       const ICpuGpuVectorPtr seqStartPos);
-
- public:
-  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
-
-bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  CHECK_EQ(1U, inputLayers_.size());
-
-  beamSize_ = config_.beam_size();
-  CHECK_GE(beamSize_, 1U);
-
-  setNeedSequenceInfo(false);
-  setNeedGradient(false);
-  return ret;
-}
-
-void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
-                                        real* sortedIds,
-                                        const ICpuGpuVectorPtr seqStartPos) {
-  int* starts = seqStartPos->getMutableData(false);
-  std::vector<real> indices;
-  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
-    int seqLen = starts[i + 1] - starts[i];
-    int k = std::min(static_cast<int>(beamSize_), seqLen);
-
-    indices.resize(seqLen, 0);
-    std::iota(begin(indices), end(indices), 0.);
-    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
-    std::partial_sort(
-        begin(indices),
-        begin(indices) + k,
-        end(indices),
-        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
-    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
-  }
-}
-
-void KmaxSeqScoreLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const MatrixPtr inputScore = getInputValue(0);
-
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "input of " << getName()
-      << " must be a sequence or a nested sequence.";
-  CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName() << " are scores over a sequence or "
-      << "a nested sequence, so its width must be 1.";
-
-  if (useGpu_) {
-    /*
-     * currently, this Layer only runs in CPU, if the other part of the model is
-     * runing on GPU, then copy the input to this layer from GPU to CPU.
-     */
-    Matrix::resizeOrCreate(scores_,
-                           inputScore->getHeight(),
-                           1,
-                           false /* trans */,
-                           false /* useGpu */);
-    scores_->copyFrom(*inputScore);
-  } else {
-    scores_ = inputScore;
-  }
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but output of this layer which is some selected indices of the give
-   * sequence are actually filled with int types so that storing int types
-   * information in a real number matrix is dangerous, since real numbers will
-   * be convered to int types.
-   */
-  Matrix::resizeOrCreate(
-      output_.value,
-      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
-      beamSize_,
-      false,
-      false);
-  output_.value->one();
-  output_.value->mulScalar(-1.);
-
-  kmaxScorePerSeq(scores_->getData(),
-                  output_.value->getData(),
-                  input.hasSubseq() ? input.subSequenceStartPositions
-                                    : input.sequenceStartPositions);
-}
-
-void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
deleted file mode 100644
index a3e627e5704..00000000000
--- a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "L2DistanceLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(l2_distance, L2DistanceLayer);
-
-bool L2DistanceLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
-                                     << "only two inputs.";
-  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
-                           << "is fixed to be 1.";
-
-  return true;
-}
-
-void L2DistanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const auto inV1 = getInputValue(0);
-  const auto inV2 = getInputValue(1);
-
-  CHECK(inV1 && inV2);
-  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
-      << "The height of two inputs of this layer must be the same.";
-  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
-      << "The width of two inputs of this layer must be the same.";
-
-  int batchSize = inV1->getHeight();
-  int output_dim = getSize();
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-    reserveOutput(batchSize, output_dim);
-    auto outV = getOutputValue();
-    CHECK(outV) << "The output matrix should not be null.";
-
-    Matrix::resizeOrCreate(
-        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
-
-    inputSub_->assign(*inV1);
-    inputSub_->sub(*inV2);
-    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
-    outV->sqrt2(*outV);
-  }
-}
-
-void L2DistanceLayer::backward(const UpdateCallback& callback) {
-  const auto outG = getOutputGrad();
-  const auto outV = getOutputValue();
-  CHECK(outG && outV);
-
-  auto inGrad1 = getInputGrad(0);
-  auto inGrad2 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-
-    if (inGrad1 || inGrad2) {
-      outV->scalarDiv(*outV, 1.);
-      outV->dotMul(*outG, *outV);
-    }
-
-    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
-
-    if (inGrad2) {
-      inputSub_->mulScalar(-1.);
-      inGrad2->addRowScale(0, *inputSub_, *outV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.h b/paddle/legacy/gserver/layers/L2DistanceLayer.h
deleted file mode 100644
index aa8aabd9ca5..00000000000
--- a/paddle/legacy/gserver/layers/L2DistanceLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief The layer calculates the l2 distance between two input vectors.
- * \f[
- * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim)
- * - Input2: A vector (batchSize * dataDim)
- * - Output: A vector (batchSize * 1)
- *
- * The configuration api is: l2_distance_layer.
- */
-
-class L2DistanceLayer : public Layer {
- public:
-  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
-  ~L2DistanceLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  // Store the result of subtracting Input2 from Input1 in forward computation,
-  // which will be reused in backward computation.
-  MatrixPtr inputSub_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.cpp b/paddle/legacy/gserver/layers/Layer.cpp
deleted file mode 100644
index 890d33552dd..00000000000
--- a/paddle/legacy/gserver/layers/Layer.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include "CostLayer.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Error.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "ValidationLayer.h"
-#endif
-
-DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
-
-namespace paddle {
-
-Layer::Layer(const LayerConfig& config, bool useGpu)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(CPU_DEVICE),
-      needSequenceInfo_(true) {}
-
-bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  output_.deviceId = deviceId_;
-
-  for (auto& inputConfig : config_.inputs()) {
-    std::string inputName = inputConfig.input_layer_name();
-    LayerPtr inputLayer;
-    CHECK(mapGet(inputName, layerMap, &inputLayer))
-        << "Cannot find input layer " << inputName << " for layer "
-        << getName();
-    this->addPrev(inputLayer);
-
-    inputLayer->addOutputArgument(deviceId_);
-
-    if (inputConfig.has_input_parameter_name()) {
-      ParameterPtr parameter;
-      CHECK(
-          mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter))
-          << "Cannot find input parameter "
-          << inputConfig.input_parameter_name() << " for layer " << getName();
-      parameter->incShared();
-      CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-      parameters_.push_back(parameter);
-    } else {
-      parameters_.push_back(nullptr);
-    }
-
-    if (inputConfig.has_input_layer_argument()) {
-      inputArgument_.push_back(inputConfig.input_layer_argument());
-    } else {
-      inputArgument_.push_back("");
-    }
-  }
-
-  if (config_.has_bias_parameter_name()) {
-    CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_))
-        << "Cannot find bias parameter " << config_.bias_parameter_name()
-        << " for layer " << getName();
-    biasParameter_->incShared();
-    CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId());
-  }
-
-  /* specify the activation function according to the configuration */
-  std::string action_type = config_.active_type();
-  activation_.reset(ActivationFunction::create(action_type));
-  CHECK(activation_);
-
-  initNeedFlags();
-  markInBackward_.assign(inputLayers_.size(), false);
-
-  return true;
-}
-
-ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
-
-LayerPtr Layer::create(const LayerConfig& config) {
-  std::string type = config.type();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOTE: As following types have illegal character '-',
-  // they can not use REGISTER_LAYER to registrar.
-  // Besides, to fit with old training models,
-  // they can not use '_' instead.
-  if (type == "multi-class-cross-entropy")
-    return LayerPtr(new MultiClassCrossEntropy(config));
-  else if (type == "rank-cost")
-    return LayerPtr(new RankingCost(config));
-  else if (type == "auc-validation")
-    return LayerPtr(new AucValidation(config));
-  else if (type == "pnpair-validation")
-    return LayerPtr(new PnpairValidation(config));
-#endif
-
-  return LayerPtr(registrar_.createByType(config.type(), config));
-}
-
-void Layer::resetSpecifyOutput(Argument& output,
-                               size_t height,
-                               size_t width,
-                               bool isValueClean,
-                               bool isGradClean) {
-  SetDevice device(output.deviceId);
-
-  Matrix::resizeOrCreate(
-      output.value, height, width, /* trans */ false, useGpu(output.deviceId));
-  if (isValueClean) {
-    output.value->zeroMem();
-  }
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    Matrix::resizeOrCreate(
-        output.grad, height, width, /* trans */ false, useGpu(output.deviceId));
-    if (isGradClean) {
-      output.grad->zeroMem();
-    }
-  }
-}
-
-void Layer::resizeOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, false);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false);
-  }
-}
-
-void Layer::reserveOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true);
-  }
-}
-
-void Layer::resetOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, true, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true);
-  }
-}
-
-void Layer::addOutputArgument(int deviceId) {
-  if (deviceId == deviceId_) {
-    output_.countIncrement();
-    return;
-  } else {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == deviceId) {
-        outputOtherDevice_[i].countIncrement();
-        return;
-      }
-    }
-  }
-
-  Argument argu;
-  argu.deviceId = deviceId;
-  outputOtherDevice_.push_back(argu);
-  outputOtherDevice_.back().countIncrement();
-}
-
-void Layer::copyOutputToOtherDevice() {
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    SetDevice device(outputOtherDevice_[i].deviceId);
-    // If outputOtherDevice_[i].value is a CpuMatrix,
-    // the copyFrom is a synchronous interface.
-    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
-    // calculations are all on HPPL_STREAM_DEFAULT,
-    // copyFrom can be an asynchronous interface.
-    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
-                                          HPPL_STREAM_DEFAULT);
-    outputOtherDevice_[i].sequenceStartPositions =
-        output_.sequenceStartPositions;
-    outputOtherDevice_[i].subSequenceStartPositions =
-        output_.subSequenceStartPositions;
-    outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-
-    outputOtherDevice_[i].notifyValueReady();
-  }
-}
-
-void Layer::waitInputValue() {
-  for (size_t i = 0; i != inputLayers_.size(); i++) {
-    if (inputLayers_[i]->getDeviceId() != deviceId_) {
-      getInput(i).waitValueReady();
-    }
-  }
-}
-
-void Layer::waitAndMergeOutputGrad() {
-  if (!output_.grad || !outputOtherDevice_.size()) {
-    return;
-  }
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    outputOtherDevice_[i].waitGradReady();
-  }
-
-  /* merge output grad */
-  size_t i = 0;
-  if (!output_.getAllCount()) {
-    output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-
-    i++;
-    if (outputOtherDevice_.size() == 1) return;
-  }
-
-  Matrix::resizeOrCreate(tmpGrad_,
-                         output_.grad->getHeight(),
-                         output_.grad->getWidth(),
-                         /* trans */ false,
-                         useGpu(output_.deviceId));
-
-  for (; i != outputOtherDevice_.size(); i++) {
-    tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-    output_.grad->add(*tmpGrad_);
-  }
-}
-
-void Layer::markAllInputGrad() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (!markInBackward_[i]) {
-      inputLayers_[i]->getOutput(deviceId_).notifyGradReady();
-    }
-    markInBackward_[i] = false;
-  }
-}
-
-void Layer::markInputGrad(int inputIndex) {
-  inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady();
-  markInBackward_[inputIndex] = true;
-}
-
-void Layer::zeroGrad() {
-  CHECK(output_.grad.get() != NULL);
-  output_.grad->zeroMem();
-}
-
-void Layer::initNeedFlags() {
-  auto initFlag = [this](
-      bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) {
-    flag = false;
-    if (biasParameter_ && biasParameter_->hasType(type)) {
-      flag = true;
-    }
-    if (!flag) {
-      for (auto& para : parameters_) {
-        if (para && para->hasType(type)) {
-          flag = true;
-          break;
-        }
-      }
-    }
-    if (!flag) {
-      for (auto& layer : inputLayers_) {
-        if ((layer.get()->*flagQueryFunc)()) {
-          flag = true;
-        }
-      }
-    }
-  };
-  initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT);
-}
-
-void Layer::showOutputStats() {
-  MatrixPtr out = getOutputValue();
-  if (!out) return;
-  if (!out->getElementCnt()) {
-    LOG(INFO) << "The number of output of " << config_.name()
-              << " is 0, skip to show the statistics";
-    return;
-  }
-  MatrixPtr outSquare;
-  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
-    GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
-    outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(),
-                                                  tmp->getWidth(),
-                                                  tmp->getElementCnt(),
-                                                  tmp->getValueType(),
-                                                  tmp->getFormat());
-  } else {
-    outSquare = out->clone();
-  }
-  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  real mean = outSquare->getSum() / out->getElementCnt();
-  real min;
-  real max;
-  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
-    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
-    min = tmpMat->getMin();
-    max = tmpMat->getMax();
-    tmpMat->square2();
-    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
-  } else {
-    min = outSquare->getMin();
-    max = outSquare->getMax();
-    outSquare->square2();
-  }
-  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
-  std = std > 0 ? std : 0;
-  LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean
-            << ", "
-            << "std=" << std << ", "
-            << "min=" << min << ", "
-            << "max=" << max;
-}
-
-void Layer::forwardActivation() {
-  /* activation */
-  auto status = activation_->forward(output_);
-  status.check();
-
-  /* dropout */
-  if (config_.drop_rate() > 0) {
-    forwardDropOut();
-    CHECK_NE(activation_->getName(), "softmax")
-        << "Softmax activation cannot be used with Dropout";
-  }
-
-  if (FLAGS_show_layer_stat) {
-    showOutputStats();
-  }
-}
-
-void Layer::backwardActivation() {
-  /* Do error clipping */
-  if (config_.error_clipping_threshold() > 0.0f) {
-    if (FLAGS_log_error_clipping) {
-      VectorPtr outGradVec = Vector::create(
-          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
-      real maxAbsGrad = outGradVec->getAbsMax();
-      if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
-        LOG(INFO) << " layer=" << config_.name() << " need clipping,"
-                  << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
-      }
-    }
-    output_.grad->clip(-config_.error_clipping_threshold(),
-                       config_.error_clipping_threshold());
-  }
-
-  /* Do dropout for delta*/
-  if (config_.drop_rate() > 0 && passType_ != PASS_TEST) {
-    MatrixPtr oGrad = getOutputGrad();
-    oGrad->dotMul(*oGrad, *dropOutMask_);
-  }
-
-  auto status = activation_->backward(output_);
-  status.check();
-}
-
-void Layer::forwardDropOut() {
-  auto& outV = getOutputValue();
-
-  if (passType_ == PASS_TRAIN) {
-    // new dropOutMask_ if dropOutMask_ is null ptr
-    Matrix::resizeOrCreate(dropOutMask_,
-                           outV->getHeight(),
-                           outV->getWidth(),
-                           false,
-                           useGpu(deviceId_));
-    dropOutMask_->randomizeUniform();  // generate a uniform random matrix
-    dropOutMask_->biggerThanScalar(config_.drop_rate());  // random mask
-    outV->dotMul(*outV, *dropOutMask_);                   // dropout
-  } else if (passType_ == PASS_GC) {
-    // only initialize once
-    if (!dropOutMask_) {
-      dropOutMask_ = Matrix::create(
-          outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_));
-      // We use cpu matrix to generate mask so that the mask
-      // will be same for both gpu version and cpu version.
-      // This will help unittest to make sure they have same result.
-      MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth());
-      tmpMask->randomizeUniform();  // generate a uniform random matrix
-      tmpMask->biggerThanScalar(config_.drop_rate());  // random mask
-      dropOutMask_->copyFrom(*tmpMask);
-    }
-    outV->dotMul(*outV, *dropOutMask_);
-  } else {  // passType == PASS_TEST
-    outV->mulScalar(1.0 - config_.drop_rate());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.h b/paddle/legacy/gserver/layers/Layer.h
deleted file mode 100644
index a7ff76decea..00000000000
--- a/paddle/legacy/gserver/layers/Layer.h
+++ /dev/null
@@ -1,512 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/function/Function.h"
-#include "paddle/legacy/gserver/activations/ActivationFunction.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Weight.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Util.h"
-
-/// Macro for registering a layer type.
-/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
-#define REGISTER_LAYER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name(   \
-      []() { Layer::registrar_.registerClass<__class_name>(#__type_name); })
-
-#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \
-  static InitFunction __reg_type_##__type_name(                 \
-      []() { Layer::registrar_.registerClass(#__type_name, createFunction); })
-
-namespace paddle {
-
-class Layer;
-typedef std::shared_ptr<Layer> LayerPtr;
-typedef std::map<std::string, LayerPtr> LayerMap;
-class NeuralNetwork;
-
-/// layer state, used for RNN and LSTM layers
-struct LayerState {
-  std::vector<MatrixPtr> value;
-};
-typedef std::shared_ptr<LayerState> LayerStatePtr;
-
-/// Paddle device ID, MKLDNN is -2, CPU is -1
-enum PADDLE_DEVICE_ID {
-  MKLDNN_DEVICE = -2,
-  CPU_DEVICE = -1,
-};
-
-/**
- * @brief Base class for layer.
- * Define necessary variables and functions for every layer.
- */
-class Layer {
- protected:
-  /// Layer config
-  LayerConfig config_;
-  /// whether to use GPU
-  bool useGpu_;
-  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
-  int deviceId_;
-  /// Input layers
-  std::vector<LayerPtr> inputLayers_;
-  /// Argument of input layers
-  std::vector<std::string> inputArgument_;
-
-  /// Parameter for each input layer.
-  /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter.
-  std::vector<ParameterPtr> parameters_;
-
-  /// nullptr if bias is not needed.
-  ParameterPtr biasParameter_;
-
-  /// Output
-  Argument output_;
-  /// Several outputs stored on different devices, used in 'parallel_nn' case,
-  /// and record them by deviceId_.
-  /// Also used in 'use_mkldnn' case.
-  std::vector<Argument> outputOtherDevice_;
-  /// If there are several outputs, map them by each name.
-  /// MKLDNNLayer use it only to merge output grad
-  std::map<std::string, Argument*> outputMap_;
-  /// Used to merge grad on different devices.
-  MatrixPtr tmpGrad_;
-
-  std::unique_ptr<ActivationFunction> activation_;
-
-  /// Current passType, PASS_TRAIN or PASS_TEST
-  PassType passType_;
-
-  /// Random 0-1 matrix for dropOut
-  MatrixPtr dropOutMask_;
-
-  /// Whether the layer need to compute gradient
-  bool needGradient_;
-  /// Whether the layer need to compute re-sequence information
-  bool needSequenceInfo_;
-
-  /// Mark input grad in(true) or out(false) of backward function.
-  std::vector<bool> markInBackward_;
-
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-
- public:
-  /**
-   * Wait until all input value ready.
-   * Called before Layer::forward() function.
-   */
-  virtual void waitInputValue();
-
-  /**
-   * Copy layer's output_ to other device.
-   * If output layer is in other device, called after Layer::forward() function.
-   */
-  virtual void copyOutputToOtherDevice();
-
-  /**
-   * Wait until all output grad ready and merge them to output_.grad.
-   * Called before Layer::backward() function.
-   */
-  virtual void waitAndMergeOutputGrad();
-
-  /**
-   * Notify previous layer the output grad ready.
-   * Called after Layer::backward() function.
-   */
-  virtual void markAllInputGrad();
-
- protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
-  /**
-   * Notify specified layer the output grad ready.
-   * Called in the backward function.
-   * If do mark input grad in the backward function, you should to ensure
-   * that all input grad will be marked in the backward function.
-   */
-  void markInputGrad(int inputIndex);
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(size_t inputIndex) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(const Layer& inputLayer) const {
-    return inputLayer.getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer with deviceId.
-   */
-  const Argument& getInput(size_t inputIndex, int deviceId) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId);
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value with deviceId.
-   */
-  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).value;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
-  }
-
-  /**
-   * Get the forward-input label.
-   */
-  const IVectorPtr& getInputLabel(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).ids;
-  }
-
-  /**
-   * Change the size of output (value, grad).
-   * Reset to value zero if isValueClean = true,
-   * Reset to grad zero if isGradClean = true.
-   */
-  void resetSpecifyOutput(Argument& output,
-                          size_t height,
-                          size_t width,
-                          bool isValueClean,
-                          bool isGradClean);
-
-  /**
-   * Add output argument to other devices.
-   */
-  void addOutputArgument(int deviceId);
-
- public:
-  explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
-  virtual ~Layer() {}
-
-  /// Register a Layer
-  static ClassRegistrar<Layer, LayerConfig> registrar_;
-
-  /**
-   * Get the flag whether layer need to compute gradient.
-   */
-  bool needGradient() const { return needGradient_; }
-
-  /**
-   * Set the flag whether layer need to compute gradient.
-   */
-  void setNeedGradient(bool need) { needGradient_ = need; }
-
-  /**
-   * Set the flag whether layer need to re-compute sequence information,
-   * which includes sequenceStartPositions or subSequenceStartPositions.
-   */
-  void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; }
-
-  /**
-   * Get layer's name.
-   */
-  const std::string& getName() const { return config_.name(); }
-
-  /**
-   * Get layer's type.
-   */
-  const std::string& getType() const { return config_.type(); }
-
-  /**
-   * Get layer's size.
-   */
-  size_t getSize() const { return config_.size(); }
-
-  /**
-   * Get layer's deviceId.
-   */
-  int getDeviceId() const { return deviceId_; }
-
-  /**
-   * Add the inputLayer.
-   */
-  void addPrev(LayerPtr l) { inputLayers_.push_back(l); }
-
-  /**
-   * Get the size of inputLayer[i].
-   */
-  const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; }
-
-  /**
-   * Get the forward-output value.
-   */
-  const MatrixPtr& getOutputValue() { return output_.value; }
-
-  /**
-   * Get the forward-output label.
-   */
-  const IVectorPtr& getOutputLabel() { return output_.ids; }
-
-  /**
-   * Get the backward-Loss value.
-   */
-  const MatrixPtr& getOutputGrad() { return output_.grad; }
-  /**
-   * If layer has multi-output, set output into outputMap_.
-   */
-  void setOutput(const std::string& name, Argument* output) {
-    outputMap_[name] = output;
-  }
-
-  /**
-   * Get the output map size, if layer has multi-output.
-   */
-  size_t getOutputMapSize() { return outputMap_.size(); }
-
-  /**
-   * Get the output based on layer's name.
-   */
-  Argument& getOutput(const std::string& str = "") {
-    if (str == "") {
-      return output_;
-    } else {
-      auto output = outputMap_.find(str);
-      if (output != outputMap_.end()) {
-        return *output->second;
-      } else {
-        LOG(FATAL) << "No specific output " << str;
-        return *((Argument*)nullptr);
-      }
-    }
-  }
-
-  /**
-   * Get the output based on deviceId.
-   */
-  const Argument& getOutput(int deviceId) const {
-    if (deviceId == getDeviceId()) {
-      return output_;
-    } else {
-      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-        if (outputOtherDevice_[i].deviceId == deviceId) {
-          return outputOtherDevice_[i];
-        }
-      }
-
-      LOG(FATAL) << "No specific device output ";
-      return *((Argument*)nullptr);
-    }
-  }
-
-  /**
-   * Get layer's parameters.
-   */
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  /**
-   * Get layer's bias-parameters.
-   */
-  const ParameterPtr& getBiasParameter() { return biasParameter_; }
-
-  /**
-   * Create pointer of layer.
-   */
-  static LayerPtr create(const LayerConfig& config);
-
-  /**
-   * Resize the output matrix size.
-   */
-  void resizeOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value to zero.
-   */
-  void reserveOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value and grad to zero.
-   */
-  void resetOutput(size_t height, size_t width);
-
-  /**
-   * Clear the gradient of output.
-   */
-  void zeroGrad();
-
-  /**
-   * Intialization.
-   * For example, adding input layers from layerMap and parameterMap.
-   */
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  /**
-   * Intialization for sub network if there has sub network.
-   * @param rootNetwork root network
-   * @param config model config
-   * @param parameterTypes parameter's type
-   * @param useGpu whether to use gpu or not
-   */
-  virtual void initSubNetwork(NeuralNetwork* rootNetwork,
-                              const ModelConfig& config,
-                              const std::vector<ParameterType>& parameterTypes,
-                              bool useGpu) {}
-
-  /**
-   * @brief Access SubNetwork Object.
-   *        If subnetwork exists, then invoke callback with subnetwrk.
-   * @param callback if sub-network is exist, the callback is invoked.
-   */
-  virtual void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) {}
-
-  /**
-   * If use sparse row matrix as parameter,
-   * prefetch feature ids in input label.
-   */
-  virtual void prefetch() {}
-
-  /**
-   * Forward propagation.
-   * All inherited implementation should call Layer::foward() function.
-   */
-  virtual void forward(PassType passType) {
-    passType_ = passType;
-    if (!inputLayers_.empty() && needSequenceInfo_) {
-      const Argument& input = getInput(0);
-      output_.sequenceStartPositions = input.sequenceStartPositions;
-      output_.subSequenceStartPositions = input.subSequenceStartPositions;
-      output_.cpuSequenceDims = input.cpuSequenceDims;
-    }
-  }
-
-  /**
-   * Reset the internal state variables.
-   * Allocate them if they have not been allocated.
-   * This function need to called before Layer::forward() for generating
-   * sequence.
-   *
-   * This is used for sequence generation. When generating sequence, the
-   * calculation at current timestamp depends on the state from previous
-   * timestamp. The model needs to keep the information about the previous
-   * timestamp in the state variables. Layers such as RecurrentLayer,
-   * LstmLayer and ContextLayer have state variables.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state.
-   * @return A copy of internal state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * Show output state.
-   */
-  void showOutputStats();
-
-  /**
-   * Backward propagation.
-   * Should only be called after Layer::forward() function.
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * One pass is finished.
-   */
-  virtual void onPassEnd() {}
-
- protected:
-  /**
-   * Forward of activation function.
-   */
-  void forwardActivation();
-  /**
-   * Backward of activation function.
-   */
-  void backwardActivation();
-  /**
-   * Forward of dropOut.
-   */
-  void forwardDropOut();
-  /**
-   * Initilize the needGradient_ flag.
-   */
-  void initNeedFlags();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.cpp b/paddle/legacy/gserver/layers/LinearChainCRF.cpp
deleted file mode 100644
index 315fc25fab3..00000000000
--- a/paddle/legacy/gserver/layers/LinearChainCRF.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LinearChainCRF.h"
-#include <algorithm>
-
-namespace paddle {
-
-LinearChainCRF::LinearChainCRF(int numClasses, real* para)
-    : numClasses_(numClasses) {
-  a_ = Matrix::create(para, 1, numClasses_);
-  b_ = Matrix::create(para + numClasses_, 1, numClasses_);
-  w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
-
-  ones_ = Matrix::create(1, numClasses_);
-  ones_->one();
-
-  expW_ = Matrix::create(numClasses_, numClasses_);
-}
-
-// normalize x so that its sum is 1 and return the original sum;
-static real normalizeL1(real* x, int n) {
-  real sum = 0;
-  for (int i = 0; i < n; ++i) {
-    sum += x[i];
-  }
-  // Right now, we just bet that sum won't be zero. If this really happens,
-  // we will figure out what should be done then.
-  CHECK_GT(sum, 0);
-  real s = 1 / sum;
-  for (int i = 0; i < n; ++i) {
-    x[i] *= s;
-  }
-  return sum;
-}
-
-real LinearChainCRF::forward(real* x, int* s, int length) {
-  Matrix::resizeOrCreate(maxX_, length, 1);
-  Matrix::resizeOrCreate(expX_, length, numClasses_);
-  Matrix::resizeOrCreate(alpha_, length, numClasses_);
-  MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  matX->rowMax(*maxX_);
-  expX_->assign(*matX);
-  // subtract max to avoid overflow or underflow
-  expX_->mul(*maxX_, *ones_, (real)-1, (real)1);
-  expX_->exp2();
-
-  real* a = a_->getData();
-  real* b = b_->getData();
-  real* w = w_->getData();
-  real* alpha = alpha_->getData();
-  real* expX = expX_->getData();
-  real* maxX = maxX_->getData();
-
-  expW_->exp2(*w_);
-  real* expW = expW_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    alpha[i] = exp(a[i]) * expX[i];
-  }
-  real ll = -maxX[0] - log(normalizeL1(alpha, numClasses_));
-
-  for (int k = 1; k < length; ++k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real sum = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += alpha[(k - 1) * numClasses_ + j]  // (*)
-               * expW[j * numClasses_ + i];
-      }
-      alpha[k * numClasses_ + i] = expX[k * numClasses_ + i] * sum;
-    }
-    // normalizeL1 is to avoid underflow or overflow at (*)
-    ll -= maxX[k] + log(normalizeL1(alpha + k * numClasses_, numClasses_));
-  }
-  real sum = 0;
-  for (int i = 0; i < numClasses_; ++i) {
-    sum += alpha[(length - 1) * numClasses_ + i] * exp(b[i]);
-  }
-  ll -= log(sum);
-  // Now ll is equal to -log(Z)
-
-  CHECK_LT(*std::max_element(s, s + length), numClasses_);
-  // Calculate the nominator part, which depends on s
-  ll += a[s[0]] + x[s[0]] + b[s[length - 1]];
-  for (int k = 1; k < length; ++k) {
-    ll += x[k * numClasses_ + s[k]] + w[s[k - 1] * numClasses_ + s[k]];
-  }
-
-  VLOG(1) << "ll=" << ll;
-  return -ll;
-}
-
-void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
-  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
-  Matrix::resizeOrCreate(beta_, length, numClasses_);
-  real* b = b_->getData();
-  if (needWGrad) {
-    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
-    matWGrad_->zeroMem();
-    da_ = matWGrad_->subRowMatrix(0, 1);
-    db_ = matWGrad_->subRowMatrix(1, 2);
-    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
-  }
-
-  real* alpha = alpha_->getData();
-  real* beta = beta_->getData();
-  real* expW = expW_->getData();
-  real* expX = expX_->getData();
-  real* grad = matGrad_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    beta[(length - 1) * numClasses_ + i] = exp(b[i]);
-  }
-  normalizeL1(beta + (length - 1) * numClasses_, numClasses_);
-
-  for (int k = length - 2; k >= 0; --k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real sum = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j]  // (**)
-               * beta[(k + 1) * numClasses_ + j] *
-               expX[(k + 1) * numClasses_ + j];
-      }
-      beta[k * numClasses_ + i] = sum;
-    }
-    // normalizeL1 is to avoid underflow or overflow at (**)
-    normalizeL1(beta + k * numClasses_, numClasses_);
-  }
-
-  matGrad_->dotMul(*alpha_, *beta_);
-  matGrad_->rowNormalizeL1(*matGrad_);
-  for (int k = 0; k < length; ++k) {
-    grad[k * numClasses_ + s[k]] -= (real)1;
-  }
-
-  if (needWGrad) {
-    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
-
-    beta_->dotMul(*beta_, *expX_);
-    beta_->rowNormalizeL1(*beta_);
-
-    real* dw = dw_->getData();
-    for (int k = 1; k < length; ++k) {
-      real sum = 0;
-      for (int i = 0; i < numClasses_; ++i) {
-        for (int j = 0; j < numClasses_; ++j) {
-          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-                 beta[k * numClasses_ + j];
-        }
-      }
-      sum = 1 / sum;
-      for (int i = 0; i < numClasses_; ++i) {
-        for (int j = 0; j < numClasses_; ++j) {
-          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                     alpha[(k - 1) * numClasses_ + i] *
-                                     beta[k * numClasses_ + j];
-        }
-      }
-      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
-    }
-  }
-}
-
-void LinearChainCRF::decode(real* x, int* s, int length) {
-  Matrix::resizeOrCreate(alpha_, length, numClasses_);
-  real* a = a_->getData();
-  real* b = b_->getData();
-  real* w = w_->getData();
-  IVector::resizeOrCreate(track_, numClasses_ * length, /* useGpu= */ false);
-  int* track = track_->getData();
-  real* alpha = alpha_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    alpha[i] = a[i] + x[i];
-  }
-  for (int k = 1; k < length; ++k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real maxScore = -std::numeric_limits<real>::max();
-      int maxJ = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        real score = alpha[(k - 1) * numClasses_ + j] + w[j * numClasses_ + i];
-        if (score > maxScore) {
-          maxScore = score;
-          maxJ = j;
-        }
-      }
-      alpha[k * numClasses_ + i] = maxScore + x[k * numClasses_ + i];
-      track[k * numClasses_ + i] = maxJ;
-    }
-  }
-  real maxScore = -std::numeric_limits<real>::max();
-  int maxI = 0;
-  for (int i = 0; i < numClasses_; ++i) {
-    real score = alpha[(length - 1) * numClasses_ + i] + b[i];
-    if (score > maxScore) {
-      maxScore = score;
-      maxI = i;
-    }
-  }
-  s[length - 1] = maxI;
-  for (int k = length - 1; k >= 1; --k) {
-    s[k - 1] = maxI = track[k * numClasses_ + maxI];
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.h b/paddle/legacy/gserver/layers/LinearChainCRF.h
deleted file mode 100644
index 65e23905435..00000000000
--- a/paddle/legacy/gserver/layers/LinearChainCRF.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCRF {
- public:
-  /**
-   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
-   * The first numClasses values of para are for starting weights (\f$a\f$).
-   * The next numClasses values of para are for ending weights (\f$b\f$),
-   * The remaning values are for transition weights (\f$w\f$).
-   *
-   * The probability of a state sequence s of length \f$L\f$ is defined as:
-   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
-   *                  + \sum_{l=1}^L x_{s_l}
-   *                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
-   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
-   * all possible
-   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
-   */
-  LinearChainCRF(int numClasses, real* para);
-
-  /**
-   * Calculate the negative log likelihood of s given x.
-   * The size of x must be length * numClasses. Each consecutive numClasses
-   * values are the features for one time step.
-   */
-  real forward(real* x, int* s, int length);
-
-  /**
-   * Calculate the gradient with respect to x, a, b, and w.
-   * backward() can only be called after a corresponding call to forward() with
-   * the same x, s and length.
-   * The gradient with respect to a, b, and w will not be calculated if
-   * needWGrad is false.
-   * @note Please call getWGrad() and getXGrad() to get the gradient with
-   * respect to (a, b, w) and x respectively.
-   */
-  void backward(real* x, int* s, int length, bool needWGrad);
-
-  /**
-   * Find the most probable sequence given x. The result will be stored in s.
-   */
-  void decode(real* x, int* s, int length);
-
-  /*
-   * Return the gradient with respect to (a, b, w). It can only be called after
-   * a corresponding call to backward().
-   */
-  MatrixPtr getWGrad() { return matWGrad_; }
-
-  /*
-   * Return the gradient with respect to x. It can only be called after a
-   * corresponding call to backward().
-   */
-  MatrixPtr getXGrad() { return matGrad_; }
-
- protected:
-  int numClasses_;
-  MatrixPtr a_;
-  MatrixPtr b_;
-  MatrixPtr w_;
-  MatrixPtr matWGrad_;
-  MatrixPtr da_;
-  MatrixPtr db_;
-  MatrixPtr dw_;
-  MatrixPtr ones_;
-
-  MatrixPtr expX_;
-  MatrixPtr matGrad_;
-  MatrixPtr alpha_;
-  MatrixPtr beta_;
-  MatrixPtr maxX_;
-  MatrixPtr expW_;
-
-  // track_(k,i) = j means that the best sequence at time k for class i comes
-  // from the sequence at time k-1 for class j
-  IVectorPtr track_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.cpp b/paddle/legacy/gserver/layers/LinearChainCTC.cpp
deleted file mode 100644
index 1fad545b7a5..00000000000
--- a/paddle/legacy/gserver/layers/LinearChainCTC.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LinearChainCTC.h"
-#include <math.h>
-#include <limits>
-
-namespace paddle {
-
-/* log scale */
-const real EXP_MAX = std::numeric_limits<real>::max();
-const real EXP_MIN = std::numeric_limits<real>::min();
-const real LOG_ZERO = std::log(EXP_MIN);
-const real LOG_INFINITY = std::log(EXP_MAX);
-
-static inline real safeExp(real x) {
-  if (x <= LOG_ZERO) {
-    return 0;
-  }
-  if (x >= LOG_INFINITY) {
-    return EXP_MAX;
-  }
-  return std::exp(x);
-}
-
-static inline real safeLog(real x) {
-  if (x <= EXP_MIN) {
-    return LOG_ZERO;
-  }
-  return std::log(x);
-}
-
-// x=lna and y=lnb is log scale, ln(a/b)=lna-lnb
-static inline real logDiv(real x, real y) {
-  if (x - y <= LOG_ZERO) {
-    return LOG_ZERO;
-  }
-  if (x - y >= LOG_INFINITY) {
-    return LOG_INFINITY;
-  }
-  return x - y;
-}
-
-// x=lna and y=lnb is log scale, ln(a*b)=lna+lnb
-static inline real logMul(real x, real y) {
-  if (x + y <= LOG_ZERO) {
-    return LOG_ZERO;
-  }
-  if (x + y >= LOG_INFINITY) {
-    return LOG_INFINITY;
-  }
-  return x + y;
-}
-
-// x=lna and y=lnb is log scale, ln(a+b)=lna+ln(1+exp(lnb-lna)), where b > a
-static inline real logAdd(real x, real y) {
-  if (x < y) {
-    real t = y;
-    y = x;
-    x = t;
-  }
-  return x + safeLog(1 + safeExp(y - x));
-}
-
-static void setLogZero(MatrixPtr mat) {
-  size_t size = mat->getElementCnt();
-  real* data = mat->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] = LOG_ZERO;
-  }
-}
-
-LinearChainCTC::LinearChainCTC(int numClasses, bool normByTimes)
-    : numClasses_(numClasses), normByTimes_(normByTimes), logProb_(0) {
-  // set the class label of blank as "numClasses-1"
-  blank_ = numClasses - 1;
-
-  Matrix::resizeOrCreate(gradTerms_, 1, numClasses_);
-}
-
-real LinearChainCTC::forward(real* softmaxSeq,
-                             int softmaxSeqLen,
-                             int* labelSeq,
-                             int labelSeqLen) {
-  isInvalid_ = false;
-  totalTime_ = softmaxSeqLen;
-  totalSegments_ = labelSeqLen * 2 + 1;
-
-  int requiredTime = labelSeqLen;
-  int oldLabel = -1;
-
-  for (int i = 0; i < labelSeqLen; i++) {
-    if (labelSeq[i] == oldLabel) {
-      requiredTime++;
-    }
-    oldLabel = labelSeq[i];
-  }
-
-  if (totalTime_ < requiredTime) {
-    isInvalid_ = true;
-    return 0;
-  }
-
-  /* calculate the forward and backward variables,
-   * reference Chapter 7.3 of "Alex Grave, Supervised Sequence
-   * Labelling with Recurrent Neural Networks" */
-  Matrix::resizeOrCreate(logActs_, totalTime_, numClasses_, false, false);
-  real* logActsData = logActs_->getData();
-  for (int i = 0; i < totalTime_ * numClasses_; i++) {
-    logActsData[i] = safeLog(softmaxSeq[i]);
-  }
-
-  Matrix::resizeOrCreate(forwardVars_, totalTime_, totalSegments_);
-  Matrix::resizeOrCreate(backwardVars_, totalTime_, totalSegments_);
-
-  /* calculate the forward variables */
-  setLogZero(forwardVars_);
-  real* fwdVars = forwardVars_->getData();
-
-  /* dp initialization at t0 */
-  fwdVars[0] = logActs_->getData()[blank_];
-  if (totalSegments_ > 1) {
-    fwdVars[1] = logActs_->getData()[labelSeq[0]];
-  }
-  /* dp from t1 */
-  for (int i = 1; i < totalTime_; i++) {
-    real* dataPerStep = logActsData + i * numClasses_;
-    real* oldFvars = fwdVars + (i - 1) * totalSegments_;
-    real* fvars = fwdVars + i * totalSegments_;
-    int start, end;
-    segmentRange(start, end, i);
-    for (int j = start; j < end; j++) {
-      real fv;
-      if (j & 1) {
-        int labelIdx = j / 2;
-        int labelVal = labelSeq[labelIdx];
-        fv = logAdd(oldFvars[j], oldFvars[j - 1]);
-        if (j > 1 && (labelVal != labelSeq[labelIdx - 1])) {
-          fv = logAdd(fv, oldFvars[j - 2]);
-        }
-        fv = logMul(fv, dataPerStep[labelVal]);
-      } else {
-        fv = oldFvars[j];
-        if (j) {
-          fv = logAdd(fv, oldFvars[j - 1]);
-        }
-        fv = logMul(fv, dataPerStep[blank_]);
-      }
-      fvars[j] = fv;
-    }
-  }
-
-  real* lastFvs = fwdVars + (totalTime_ - 1) * totalSegments_;
-
-  /* sum the last two value as logprob */
-  logProb_ = lastFvs[totalSegments_ - 1];
-  if (totalSegments_ > 1) {
-    logProb_ = logAdd(logProb_, lastFvs[totalSegments_ - 2]);
-  }
-
-  /* calculate the backward variables */
-  setLogZero(backwardVars_);
-  real* bwdVars = backwardVars_->getData();
-  real* lastBvs = bwdVars + (totalTime_ - 1) * totalSegments_;
-
-  lastBvs[totalSegments_ - 1] = 0;
-  if (totalSegments_ > 1) {
-    lastBvs[totalSegments_ - 2] = 0;
-  }
-
-  for (int i = totalTime_ - 2; i >= 0; i--) {
-    real* oldDataPerStep = logActsData + (i + 1) * numClasses_;
-    real* oldBvars = bwdVars + (i + 1) * totalSegments_;
-    real* bvars = bwdVars + i * totalSegments_;
-    int start, end;
-    segmentRange(start, end, i);
-    for (int j = start; j < end; j++) {
-      real bv;
-      if (j & 1) {
-        int labelIdx = j / 2;
-        int labelVal = labelSeq[labelIdx];
-
-        bv = logAdd(logMul(oldBvars[j], oldDataPerStep[labelVal]),
-                    logMul(oldBvars[j + 1], oldDataPerStep[blank_]));
-        if (j < (totalSegments_ - 2)) {
-          int nextLabelVal = labelSeq[labelIdx + 1];
-          if (labelVal != nextLabelVal) {
-            bv = logAdd(bv,
-                        logMul(oldBvars[j + 2], oldDataPerStep[nextLabelVal]));
-          }
-        }
-      } else {
-        bv = logMul(oldBvars[j], oldDataPerStep[blank_]);
-        if (j < (totalSegments_ - 1)) {
-          bv = logAdd(bv,
-                      logMul(oldBvars[j + 1], oldDataPerStep[labelSeq[j / 2]]));
-        }
-      }
-      bvars[j] = bv;
-    }
-  }
-
-  VLOG(1) << "ctcLoss=" << -logProb_;
-
-  return -logProb_;
-}
-
-void LinearChainCTC::backward(real* softmaxSeq,
-                              real* grad,
-                              int* labelSeq,
-                              int labelSeqLen) {
-  /* if not meet the conditions of CTC computing, then set the grads to zeros */
-  if (isInvalid_) {
-    for (int i = 0; i < totalTime_ * numClasses_; i++) {
-      grad[i] += 0;
-    }
-    return;
-  }
-
-  real* fwdVars = forwardVars_->getData();
-  real* bwdVars = backwardVars_->getData();
-  real* logActsData = logActs_->getData();
-
-  for (int i = 0; i < totalTime_; i++) {
-    setLogZero(gradTerms_);
-    real* gradTermsData = gradTerms_->getData();
-    real* fvars = fwdVars + i * totalSegments_;
-    real* bvars = bwdVars + i * totalSegments_;
-    for (int j = 0; j < totalSegments_; j++) {
-      int k = (j & 1) ? labelSeq[j / 2] : blank_;
-      gradTermsData[k] = logAdd(gradTermsData[k], logMul(fvars[j], bvars[j]));
-    }
-    for (int j = 0; j < numClasses_; j++) {
-      if (normByTimes_) {
-        grad[i * numClasses_ + j] +=
-            -safeExp(
-                logDiv(gradTermsData[j],
-                       logMul(logProb_, logActsData[i * numClasses_ + j]))) /
-            totalTime_;
-      } else {
-        grad[i * numClasses_ + j] += -safeExp(
-            logDiv(gradTermsData[j],
-                   logMul(logProb_, logActsData[i * numClasses_ + j])));
-      }
-    }
-  }
-}
-
-void LinearChainCTC::segmentRange(int& start, int& end, int time) {
-  start = std::max(0, totalSegments_ - (2 * (totalTime_ - time)));
-  end = std::min(totalSegments_, 2 * (time + 1));
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.h b/paddle/legacy/gserver/layers/LinearChainCTC.h
deleted file mode 100644
index e6c4c7bfe0c..00000000000
--- a/paddle/legacy/gserver/layers/LinearChainCTC.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCTC {
- public:
-  LinearChainCTC(int numClasses, bool normByTimes);
-
-  // Calculate the negative log probability as loss
-  real forward(real* softmaxSeq,
-               int softmaxSeqLen,
-               int* labelSeq,
-               int labelSeqLen);
-
-  // calculate the gradient
-  void backward(real* softmaxSeq,
-                real* softmaxSeqGrad,
-                int* labelSeq,
-                int labelSeqLen);
-
- protected:
-  int numClasses_, blank_, totalSegments_, totalTime_;
-  bool normByTimes_;
-  bool isInvalid_;
-
-  MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_;
-
-  real logProb_;
-
-  void segmentRange(int& start, int& end, int time);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.cpp b/paddle/legacy/gserver/layers/LstmCompute.cpp
deleted file mode 100644
index 70f08e1d4ef..00000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void LstmCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-  activeState_ = hlActiveType(config.active_state_type());
-}
-
-template <>
-void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) {
-  hl_cpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardOneSequence<0>(hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize) {
-  hl_cpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-template <>
-void LstmCompute::forwardBatch<0>(hl_lstm_value value,
-                                  int frameSize,
-                                  int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    forwardOneSequence<0>(value, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-  }
-}
-
-template <>
-void LstmCompute::backwardBatch<0>(hl_lstm_value value,
-                                   hl_lstm_grad grad,
-                                   int frameSize,
-                                   int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    backwardOneSequence<0>(value, grad, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 4;
-    grad.stateGrad += frameSize;
-    grad.stateActiveGrad += frameSize;
-    grad.outputGrad += frameSize;
-    if (grad.prevStateGrad) {
-      grad.prevStateGrad += frameSize;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.cu b/paddle/legacy/gserver/layers/LstmCompute.cu
deleted file mode 100644
index 3f15edcacab..00000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.cu
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmCompute.h"
-#include "hl_recurrent_apply.cuh"
-
-namespace paddle {
-
-template <>
-void LstmCompute::forwardBatch<1>(hl_lstm_value value,
-                                  int frameSize,
-                                  int batchSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      batchSize,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardBatch<1>(hl_lstm_value value,
-                                   hl_lstm_grad grad,
-                                   int frameSize,
-                                   int batchSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       batchSize,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-template <>
-void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      /* batchSize */ 1,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       /* batchSize */ 1,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.h b/paddle/legacy/gserver/layers/LstmCompute.h
deleted file mode 100644
index ac40c35ef1b..00000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-class LstmCompute {
- public:
-  void init(LayerConfig &config);
-
-  /**
-   * LstmLayer batch compute API (forwardBatch, backwardBatch).
-   * If use batch compute api, lstm value(and grad) need to be batch structure.
-   * Compute order:
-   *   forwardBatch:  for 0 <= id < numBatch
-   *   backwardBatch:  for numBatch > id >= 0
-   */
-  template <bool useGpu>
-  void forwardBatch(hl_lstm_value value, int frameSize, int batchSize);
-
-  template <bool useGpu>
-  void backwardBatch(hl_lstm_value value,
-                     hl_lstm_grad grad,
-                     int frameSize,
-                     int batchSize);
-
-  /**
-   * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence).
-   * Compute order(for each sequence):
-   *   forwardOneSequence:
-   *     if (!reversed) for 0 <= seqId < seqLength
-   *     if (reversed)  for seqLength > seqId >= 0
-   *   backwardOneSequence:
-   *     if (!reversed) for seqLength > seqId >= 0
-   *     if (reversed)  for 0 <= seqId < seqLength
-   */
-  template <bool useGpu>
-  void forwardOneSequence(hl_lstm_value value, int frameSize);
-  template <bool useGpu>
-  void backwardOneSequence(hl_lstm_value value,
-                           hl_lstm_grad grad,
-                           int frameSize);
-
- public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-  hl_activation_mode_t activeState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.cpp b/paddle/legacy/gserver/layers/LstmLayer.cpp
deleted file mode 100644
index 43a55d8d490..00000000000
--- a/paddle/legacy/gserver/layers/LstmLayer.cpp
+++ /dev/null
@@ -1,805 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_bool(prev_batch_state);
-
-namespace paddle {
-
-REGISTER_LAYER(lstmemory, LstmLayer);
-
-bool LstmLayer::init(const LayerMap &layerMap,
-                     const ParameterMap &parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 7, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 7, biasParameter_));
-    if (bias_->getW()) {
-      localBias_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize() * 4,
-                                  /* trans= */ false,
-                                  useGpu_);
-      checkIg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkFg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkOg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-      localBias_->setData(bias_->getW()->getData());
-      checkIg_->setData(bias_->getW()->getData() + getSize() * 4);
-      checkFg_->setData(bias_->getW()->getData() + getSize() * 5);
-      checkOg_->setData(bias_->getW()->getData() + getSize() * 6);
-    }
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize() * 4,
-                                      /* trans= */ false,
-                                      useGpu_);
-      checkIgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkFgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkOgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4);
-      checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5);
-      checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6);
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  reversed_ = config_.reversed();
-
-  // create IdentityActivation for using drop_rate
-  activation_.reset(ActivationFunction::create(""));
-
-  LstmCompute::init(config_);
-  useBatch_ = true;
-  useSeqParallel_ = false;
-  if (useGpu_ && (getSize() == 32 || getSize() == 64)) {
-    useSeqParallel_ = true;
-  }
-
-  return true;
-}
-
-void LstmLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->resize(0, getSize());
-  prevState_->resize(0, getSize());
-  if (FLAGS_prev_batch_state) {
-    useBatch_ = true;
-  } else {
-    useBatch_ = false;
-  }
-}
-
-void LstmLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state";
-  prevOutput_->resize(state->value[0]->getHeight(),
-                      state->value[0]->getWidth());
-  prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth());
-  prevOutput_->copyFrom(*(state->value[0]));
-  prevState_->copyFrom(*(state->value[1]));
-}
-
-LayerStatePtr LstmLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  if (prevOutput_->getHeight() && prevOutput_->getWidth()) {
-    res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-    res->value[0]->copyFrom(*prevOutput_);
-    res->value.push_back(prevState_->clone(0, 0, useGpu_));
-    res->value[1]->copyFrom(*prevState_);
-  } else {
-    MatrixPtr output =
-        Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    output->resize(0, getSize());
-    state->resize(0, getSize());
-    res->value.push_back(output);
-    res->value.push_back(state);
-  }
-  return res;
-}
-
-void LstmLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  size_t numSequences = input.getNumSequences();
-  const int *starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  if (prevOutput_) {
-    size_t prevNumSeq = useBatch_ ? numSequences : 1;
-    if (prevOutput_->getHeight() == 0) {
-      prevOutput_->resize(prevNumSeq, getSize());
-      prevState_->resize(prevNumSeq, getSize());
-      prevOutput_->zeroMem();
-      prevState_->zeroMem();
-    } else {
-      CHECK_EQ(prevOutput_->getHeight(), prevNumSeq)
-          << "the number of sequences must be the same";
-    }
-    Matrix::resizeOrCreate(totalState_,
-                           prevState_->getHeight() + batchSize,
-                           getSize(),
-                           /*trans*/ false,
-                           useGpu_);
-    state_.value = Matrix::create(nullptr,
-                                  /* height= */ batchSize,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-    state_.value->setData(totalState_->getData() +
-                          prevState_->getHeight() * getSize());
-  } else {
-    Matrix::resizeOrCreate(state_.value,
-                           /* height= */ batchSize,
-                           getSize(),
-                           /* trans= */ false,
-                           useGpu_);
-  }
-  Matrix::resizeOrCreate(preOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (!useBatch_) {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  } else {
-    if (!useSeqParallel_) {
-      forwardBatch(batchSize, numSequences, starts, input.value);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      forwardSeqParallel(batchSize, numSequences, starts, input.value);
-    }
-  }
-  /*  activation */ { forwardActivation(); }
-}
-
-void LstmLayer::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str());
-  /*  Do derivation */ { backwardActivation(); }
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(state_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  state_.grad->zero();
-
-  const int *starts = input.sequenceStartPositions->getData(false);
-  if (!useBatch_) {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  } else {
-    if (!useSeqParallel_) {
-      backwardBatch(batchSize, numSequences, starts, input.grad);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      backwardSeqParallel(batchSize, numSequences, starts, input.grad);
-    }
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void LstmLayer::forwardSequence(int batchSize,
-                                size_t numSequences,
-                                const int *starts,
-                                MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-  lstmValue.prevStateValue = nullptr;
-  if (reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&lstmValue](bool reversed, int frameSize) {
-    lstmValue.prevStateValue = lstmValue.stateValue;
-    if (!reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmValue.outputValue += frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmValue.outputValue -= frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  if (!reversed_) {
-    if (prevState_) {
-      lstmValue.prevStateValue = prevState_->getData();
-    }
-    if (prevOutput_) {
-      frameGate->setData(lstmValue.gateValue);
-      frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        LstmCompute::forwardOneSequence<1>(lstmValue, getSize());
-      } else {
-        LstmCompute::forwardOneSequence<0>(lstmValue, getSize());
-      }
-
-      if (l != length - 1) {
-        frameOutput->setData(lstmValue.outputValue);
-        nextFrame(reversed_, getSize());
-        frameGate->setData(lstmValue.gateValue);
-        frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-      }
-    }
-    if (n != numSequences - 1) {
-      frameOutput->setData(lstmValue.outputValue);
-      nextFrame(reversed_, getSize());
-      frameGate->setData(lstmValue.gateValue);
-      if (!reversed_) {
-        if (!prevState_) lstmValue.prevStateValue = nullptr;
-        if (prevOutput_) {
-          frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-        }
-      } else {
-        lstmValue.prevStateValue = nullptr;
-      }
-    }
-  }
-
-  if (!reversed_) {
-    if (prevState_) {
-      prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1));
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void LstmLayer::backwardSequence(int batchSize,
-                                 size_t numSequences,
-                                 const int *starts,
-                                 MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str());
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = nullptr;
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-  lstmGrad.gateGrad = gate_.grad->getData();
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = nullptr;
-  lstmGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmGrad.stateGrad += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmGrad.outputGrad += (batchSize - 1) * getSize();
-    lstmValue.prevStateValue = lstmValue.stateValue - getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize();
-  } else {
-    lstmValue.prevStateValue = lstmValue.stateValue + getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize();
-  }
-
-  auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmGrad.gateGrad += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmGrad.stateGrad += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmGrad.outputGrad += frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue + frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmGrad.gateGrad -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmGrad.stateGrad -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmGrad.outputGrad -= frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue - frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      int start;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-        start = starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-        start = starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          lstmValue.prevStateValue = nullptr;
-          lstmGrad.prevStateGrad = nullptr;
-        }
-        if (useGpu_) {
-          LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize());
-        } else {
-          LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize());
-        }
-        if (l != length - 1) {
-          frameGate->setData(lstmGrad.gateGrad);
-          nextFrame(reversed_, getSize());
-          frameOutput->setData(lstmGrad.outputGrad);
-          frameOutput->mul(*frameGate, *weightT, 1, 1);
-        } else {
-          nextFrame(reversed_, getSize());
-        }
-      }
-
-      if (weight_->getWGrad()) {
-        if (!reversed_) {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start + 1, length - 1),
-              1,
-              1);
-        } else {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start, length - 1),
-              1,
-              1);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-}
-
-void LstmLayer::forwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int *starts,
-                             MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(
-      batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    if (prevState_) {
-      lstmValue.prevStateValue = totalState_->getData();
-    } else {
-      lstmValue.prevStateValue = nullptr;
-    }
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValue = batchValue_->getBatchValue(n);
-      MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n);
-      batchSize = outputValue->getHeight();
-
-      if (n != 0) {
-        MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize);
-        gateValue->mul(*batch1, *weight_->getW(), 1, 1);
-      } else if (prevOutput_) {
-        Matrix::resizeOrCreate(prevBatchOutput2_,
-                               gateValue->getHeight(),
-                               getSize(),
-                               false,
-                               useGpu_);
-        batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_);
-        gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1);
-
-        batchValue_->prevOutput2Batch(*prevState_,
-                                      *totalState_->subMatrix(0, numSequences));
-      }
-
-      lstmValue.gateValue = gateValue->getData();
-      lstmValue.outputValue = outputValue->getData();
-      lstmValue.stateValue =
-          batchValue_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchValue_->getBatchValue(*preOutput_.value, n)->getData();
-      {
-        if (useGpu_) {
-          LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-        } else {
-          LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-        }
-      }
-      lstmValue.prevStateValue = lstmValue.stateValue;
-    }
-  }
-  {
-    REGISTER_TIMER_INFO("batchToSeq", getName().c_str());
-    batchValue_->copyBackSeq(*output_.value);
-  }
-  if (prevOutput_) {
-    getPrevBatchOutput(numSequences);
-    getPrevBatchState(numSequences);
-  }
-}
-
-void LstmLayer::getPrevBatchOutput(size_t numSequences) {
-  prevOutput_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevOutput_,
-                                     *batchValue_->getBatchValue());
-}
-
-void LstmLayer::getPrevBatchState(size_t numSequences) {
-  prevState_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value);
-}
-
-void LstmLayer::backwardBatch(int batchSize,
-                              size_t numSequences,
-                              const int *starts,
-                              MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  hl_lstm_grad lstmGrad;
-  lstmGrad.stateActiveGrad = preOutput_.grad->getData();
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  {
-    REGISTER_TIMER_INFO("seqToBatch", getName().c_str());
-    batchGrad_->copyFromSeq(*output_.grad);
-  }
-
-  {
-    MatrixPtr weightT = weight_->getW()->getTranspose();
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr outputGrad = batchGrad_->getBatchValue(n);
-      MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n);
-
-      lstmValue.gateValue =
-          batchGrad_->getBatchValue(*gate_.value, n)->getData();
-      lstmValue.stateValue =
-          batchGrad_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchGrad_->getBatchValue(*preOutput_.value, n)->getData();
-      lstmGrad.stateGrad =
-          batchGrad_->getBatchValue(*state_.grad, n)->getData();
-      lstmGrad.gateGrad = gateGrad->getData();
-      lstmGrad.outputGrad = outputGrad->getData();
-      {
-        batchSize = outputGrad->getHeight();
-        if (n != 0) {
-          lstmValue.prevStateValue =
-              batchGrad_->getBatchValue(*state_.value, n - 1)->getData();
-          lstmGrad.prevStateGrad =
-              batchGrad_->getBatchValue(*state_.grad, n - 1)->getData();
-        } else {
-          if (prevState_) {
-            lstmValue.prevStateValue = totalState_->getData();
-            lstmGrad.prevStateGrad = nullptr;
-          } else {
-            lstmValue.prevStateValue = nullptr;
-            lstmGrad.prevStateGrad = nullptr;
-          }
-        }
-        if (useGpu_) {
-          LstmCompute::backwardBatch<1>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        } else {
-          LstmCompute::backwardBatch<0>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        }
-      }
-
-      if (n != 0) {
-        MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize);
-        tmp->mul(*gateGrad, *weightT, 1, 1);
-      }
-
-      if (n != 0 && weight_->getWGrad()) {
-        /* backward weight */
-        MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize);
-        weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1);
-      } else if (prevOutput_ && weight_->getWGrad()) {
-        weight_->getWGrad()->mul(
-            *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1);
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-void LstmLayer::forwardSeqParallel(int batchSize,
-                                   size_t numSequences,
-                                   const int *starts,
-                                   MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, /* scale */ 1);
-  }
-
-  real *gateValue = gate_.value->getData();
-  real *stateValue = state_.value->getData();
-  real *outputValue = output_.value->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *weight = weight_->getW()->getData();
-  hl_lstm_parallel_forward(gateValue,
-                           stateValue,
-                           preOutputValue,
-                           outputValue,
-                           checkIg,
-                           checkFg,
-                           checkOg,
-                           weight,
-                           starts,
-                           getSize(),
-                           numSequences,
-                           reversed_,
-                           activeNode_,
-                           activeGate_,
-                           activeState_);
-}
-
-void LstmLayer::backwardSeqParallel(int batchSize,
-                                    size_t numSequences,
-                                    const int *starts,
-                                    MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str());
-  real *gateValue = gate_.value->getData();
-  real *gateGrad = gate_.grad->getData();
-  real *stateValue = state_.value->getData();
-  real *stateGrad = state_.grad->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *preOutputGrad = preOutput_.grad->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *outputGrad = output_.grad->getData();
-  real *weight = weight_->getW()->getData();
-
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-  if (bias_->getWGrad()) {
-    checkIgGrad = checkIgGrad_->getData();
-    checkFgGrad = checkFgGrad_->getData();
-    checkOgGrad = checkOgGrad_->getData();
-  } else {
-    checkIgGrad = nullptr;
-    checkFgGrad = nullptr;
-    checkOgGrad = nullptr;
-  }
-
-  hl_lstm_parallel_backward_data(gateValue,
-                                 gateGrad,
-                                 stateValue,
-                                 stateGrad,
-                                 preOutputValue,
-                                 preOutputGrad,
-                                 outputGrad,
-                                 checkIg,
-                                 checkIgGrad,
-                                 checkFg,
-                                 checkFgGrad,
-                                 checkOg,
-                                 checkOgGrad,
-                                 weight,
-                                 starts,
-                                 getSize(),
-                                 numSequences,
-                                 reversed_,
-                                 activeNode_,
-                                 activeGate_,
-                                 activeState_);
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-
-  real *outputValue = output_.value->getData();
-  if (weight_->getWGrad()) {
-    real *weightGrad = weight_->getWGrad()->getData();
-    hl_lstm_parallel_backward_weight(weightGrad,
-                                     outputValue,
-                                     gateGrad,
-                                     starts,
-                                     getSize(),
-                                     batchSize,
-                                     numSequences,
-                                     reversed_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.h b/paddle/legacy/gserver/layers/LstmLayer.h
deleted file mode 100644
index 8c8b382f505..00000000000
--- a/paddle/legacy/gserver/layers/LstmLayer.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-namespace paddle {
-
-/**
- * @brief LstmLayer takes 1 input layer with size * 4.
- * Input layer is diveded into 4 equal parts:
- *   (input_s, input_ig, input_fg, input_og)
- *
- * For each sequence [start, end] it performs the following computation:
- * @code
- * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
- * state_{i} = actInput(input_s_{i} + bias_s +
- *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
- *             actGate(forgetGate_{i}) * state_{i-1}
- * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
- *             state_{i-1} * inputCheck
- * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
- *             state_{i} * outputCheck
- * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
- *              state_{i-1} * forgetCheck
- * @endcode
- *
- * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
- * - baisParameter consists of
- *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
- *
- * - actInput is defined by config active_type.
- * - actState is defined by config active_state_type.
- * - actGate is defined by config actvie_gate_type.
- *
- * There are two ways to compute, namely one sequence by one sequence or
- * one batch by one batch. By default and no setting pre_batch_state true,
- * it will compute batch by batch.
- *
- * The formula in the paper is as follows:
- * \f[
- * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
- * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
- * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
- * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
- * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
- * h_t = o_t tanh(c_t)
- * \f]
- *
- * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
- * operations on the input sequence were NOT included in LstmLayer. So
- * users should use fc_layer or mixed_layer before lstm_later.
- *
- * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
- * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
- */
-
-class LstmLayer : public Layer, public LstmCompute {
- public:
-  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
-
-  bool init(const LayerMap &layerMap,
-            const ParameterMap &parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback &callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  /**
-   * @brief Compute lstm forward one sequence by one sequence.
-   * @param batchSize The batchSize is not equal to the batch_size in
-   * the config file. It is the total words number of all samples
-   * in this forward batch.
-   * @param numSequences The sample number. It is equal to the batch_size
-   * in the config file.
-   * @param starts Each start position of each samples.
-   * @param inputValue The input values.
-   */
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int *starts,
-                       MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one sequence by one sequence.
-   */
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int *starts,
-                        MatrixPtr inputGrad);
-
-  /**
-   * Compute lstm forward one batch by one batch. The batch value is
-   * reorganized by SequenceToBatch class. The batch output value will
-   * be convert into sequence value after finishing forward. Here, one
-   * batch contains one word of each sample. If the length of each sample
-   * is not equality, the batch will not pads zero and contains less words.
-   * The total batch numbers are the max length of the sequence. The details
-   * can refer to SequenceToBatch class. On GPU mode, it will launch GPU
-   * kernel for loop.
-   *
-   * @code
-   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
-   *   compute one batch.
-   * }
-   * @endcode
-   */
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int *starts,
-                    MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one batch by one batch.
-   */
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int *starts,
-                     MatrixPtr inputGrad);
-
-  /**
-   * This function only supports GPU. It not need to reorganize input into
-   * batch value. It will launch one kernel to parallelly compute forward
-   * propagation in sequence level.
-   */
-  void forwardSeqParallel(int batchSize,
-                          size_t numSequences,
-                          const int *starts,
-                          MatrixPtr inputValue);
-  /**
-   * Backward propagation corresponding to forwardSeqParallel.
-   */
-  void backwardSeqParallel(int batchSize,
-                           size_t numSequences,
-                           const int *starts,
-                           MatrixPtr inputGrad);
-  /**
-   * This function is used for sequence generation and get output after
-   * forwardBatch.
-   */
-  void getPrevBatchOutput(size_t numSequences);
-  /**
-   * This function is used for sequence generation and get state after
-   * forwardBatch.
-   */
-  void getPrevBatchState(size_t numSequences);
-
- protected:
-  /// Learned parameters, shape: (size, 4*size).
-  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
-  std::unique_ptr<Weight> weight_;
-  /// Learned bias parameter, shape: (1, 7 * size).
-  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
-  /// W_{co}\f$.
-  std::unique_ptr<Weight> bias_;
-  /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
-  MatrixPtr localBias_;
-  /// The peephole connection for input gate.
-  MatrixPtr checkIg_;
-  /// The peephole connection for forget gate.
-  MatrixPtr checkFg_;
-  /// The peephole connection for output gate.
-  MatrixPtr checkOg_;
-  /// The gradient of real bias
-  MatrixPtr localBiasGrad_;
-  /// The gradient of peephole connection for input gates.
-  MatrixPtr checkIgGrad_;
-  /// The gradient of peephole connection for forget gates.
-  MatrixPtr checkFgGrad_;
-  /// The gradient of peephole connection for output gates.
-  MatrixPtr checkOgGrad_;
-
-  /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$.
-  Argument state_;
-  /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$.
-  Argument preOutput_;
-  /// Stores the value and gradient of four gates, namely
-  /// \f$i_t, f_t, o_t, c_t\f$.
-  Argument gate_;
-  /// Whether it is reversed lstm.
-  bool reversed_;
-  /// Whether to use batch method to compute.
-  bool useBatch_;
-  /// Whether to use sequence parallell method to compute.
-  bool useSeqParallel_;
-  /// batchValue_ is used in method of batch calculation. It stores the
-  /// batch value after reorganized input.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// The gradient of batchValue_.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-
-  /// Used in generation and stores the state of previous time step.
-  MatrixPtr prevState_;
-  /// Used in generation and stores the output of previous time step.
-  MatrixPtr prevOutput_;
-  MatrixPtr prevBatchOutput2_;
-  /// The total state.
-  MatrixPtr totalState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmStepLayer.cpp b/paddle/legacy/gserver/layers/LstmStepLayer.cpp
deleted file mode 100644
index f02f8ad62fe..00000000000
--- a/paddle/legacy/gserver/layers/LstmStepLayer.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/*
- * LstmStepLayer used in recurrent layer group.
- */
-class LstmStepLayer : public Layer, public LstmCompute {
- protected:
-  Argument state_;
-  Argument gate_;
-  Argument stateActive_;
-  MatrixPtr checkIg_, checkFg_, checkOg_;
-  MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
-  std::unique_ptr<Weight> weight_;
-
- public:
-  explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~LstmStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(lstm_step, LstmStepLayer);
-
-bool LstmStepLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  checkIg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkFg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkOg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkIgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkFgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkOgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    weight_.reset(new Weight(1, getSize() * 3, biasParameter_));
-    if (weight_->getW()) {
-      real* data = weight_->getW()->getData();
-      checkIg_->setData(data);
-      checkFg_->setData(data + getSize());
-      checkOg_->setData(data + getSize() * 2);
-    }
-
-    if (weight_->getWGrad()) {
-      real* data = weight_->getWGrad()->getData();
-      checkIgGrad_->setData(data);
-      checkFgGrad_->setData(data + getSize());
-      checkOgGrad_->setData(data + getSize() * 2);
-    }
-  }
-
-  setOutput("state", &state_);
-  LstmCompute::init(config_);
-  return true;
-}
-
-void LstmStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  CHECK_EQ(getSize(), prevState.value->getWidth());
-  int batchSize = input.getBatchSize();
-  reserveOutput(batchSize, getSize());
-  resetSpecifyOutput(state_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ true);
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 4,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(stateActive_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-
-  if (useGpu_) {
-    LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-  } else {
-    LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-  }
-}
-
-void LstmStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str());
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-
-  lstmGrad.gateGrad = gate_.grad->getData();
-  if (prevState.grad) {
-    lstmGrad.prevStateGrad = prevState.grad->getData();
-  } else {
-    lstmGrad.prevStateGrad = nullptr;
-  }
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = stateActive_.grad->getData();
-  lstmGrad.outputGrad = output_.grad->getData();
-  lstmGrad.checkIgGrad = checkIgGrad_->getData();
-  lstmGrad.checkFgGrad = checkFgGrad_->getData();
-  lstmGrad.checkOgGrad = checkOgGrad_->getData();
-
-  if (useGpu_) {
-    LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize);
-  } else {
-    LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (weight_) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MDLstmLayer.cpp b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
deleted file mode 100644
index 4838183e8cc..00000000000
--- a/paddle/legacy/gserver/layers/MDLstmLayer.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class CoordIterator {
- public:
-  std::vector<int> dims_;
-  std::vector<bool> directions_;
-  std::vector<int> curPos_;
-  bool end_;
-
-  void step(size_t d, bool reversed) {
-    if (directions_[d] ^ reversed) {
-      if (curPos_[d] == dims_[d] - 1) {
-        curPos_[d] = 0;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]++;
-      }
-    } else {
-      if (curPos_[d] == 0) {
-        curPos_[d] = dims_[d] - 1;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]--;
-      }
-    }
-  }
-
- public:
-  CoordIterator(std::vector<int> dim, std::vector<bool> directions)
-      : dims_(dim), directions_(directions), end_(false) {
-    CHECK_EQ(dims_.size(), directions_.size());
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_.push_back(-1);
-    }
-  }
-  CoordIterator& operator++() {
-    step(dims_.size() - 1, false);
-    return *this;
-  }
-
-  CoordIterator& operator--() {
-    step(dims_.size() - 1, true);
-    return *this;
-  }
-
-  std::vector<int>& curPos() { return curPos_; }
-
-  int offset() {
-    int offset = curPos_[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + curPos_[i];
-    }
-    return offset;
-  }
-
-  int offset(const std::vector<int>& pos) {
-    int offset = pos[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + pos[i];
-    }
-    return offset;
-  }
-
-  std::vector<int>& begin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? 0 : dims_[i] - 1;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  std::vector<int>& rbegin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? dims_[i] - 1 : 0;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  bool end() { return end_; }
-
-  bool getPrePos(const std::vector<int>& delays,
-                 int idx,
-                 std::vector<int>& prePos) {
-    bool isAvial = true;
-    prePos.clear();
-    prePos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1));
-        if (prePos[i] < 0) {
-          prePos[i] = 0;
-          isAvial = false;
-        }
-        if (prePos[i] >= dims_[i]) {
-          prePos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        prePos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-
-  bool getNextPos(const std::vector<int>& delays,
-                  int idx,
-                  std::vector<int>& nextPos) {
-    bool isAvial = true;
-    nextPos.clear();
-    nextPos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1));
-        if (nextPos[i] < 0) {
-          nextPos[i] = 0;
-          isAvial = false;
-        }
-        if (nextPos[i] >= dims_[i]) {
-          nextPos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        nextPos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-};
-/*
- * MDLstmLayer takes 1 input layer with size * (3+numDims).
- * For each sequence [start, end] it performs the following computation:
- * out_i = actState(state_i) * actGate(outputGate_i)
- *
- * For example the image with 2 dims, we take the scanning order from left-top
- * to right-bottom, then the 2 previous states of the current pixels are the
- * ones located at left and top. And each of them has a independent forget gate.
- *
- * state_i = actInput(input_i) * actGate(inputGate_i) +
- *           \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j)
- *
- * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) +
- *             \sum{j}(state_prev_i_j * inputCheck_j)
- *
- * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) +
- *             state_i * outputCheck
- *
- * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j *
- *                recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j)
- *
- * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize
- * */
-
-class MDLstmLayer : public LstmLayer {
- public:
-  explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  void forwardOneSequence(int start, CoordIterator& coordIter);
-  void backwardOneSequence(int start, CoordIterator& coordIter);
-  void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
-  void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
-
- protected:
-  std::vector<Argument> frameInputGate_;
-  std::vector<Argument> frameForgetGate_;
-  std::vector<Argument> frameOutputGate_;
-  std::vector<Argument> frameInputNode_;
-  std::vector<Argument> frameGate_;
-  std::vector<Argument> frameState_;
-  std::vector<Argument> framePreOutput_;
-  std::vector<Argument> frameOutput_;
-
-  // Activation
-  std::unique_ptr<ActivationFunction> activationGate_;
-  std::unique_ptr<ActivationFunction> activationState_;
-
-  int numDims_;
-  size_t numBlocks_;
-  std::vector<bool> directions_;
-  std::vector<int> delays_;
-  std::vector<std::vector<int>> dimsV_;
-};
-
-REGISTER_LAYER(mdlstmemory, MDLstmLayer);
-
-bool MDLstmLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-
-  numBlocks_ = getSize();
-  numDims_ = config_.directions_size();
-  CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize());
-
-  // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_),
-  // peepOg(1), then size of localBias_ is 3+numDims_
-  CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize());
-  weight_.reset(
-      new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_));
-    localBias_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                numBlocks_ * (3 + numDims_),
-                                /* trans= */ false,
-                                useGpu_);
-    checkIg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkFg_ = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkOg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    localBiasGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    numBlocks_ * (3 + numDims_),
-                                    /* trans= */ false,
-                                    useGpu_);
-    checkIgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkFgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ numDims_,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkOgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-
-    localBias_->setData(bias_->getW()->getData());
-    checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_));
-    checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + numDims_));
-    checkOg_->setData(bias_->getW()->getData() +
-                      numBlocks_ * (4 + 2 * numDims_));
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (3 + numDims_));
-      checkFgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + numDims_));
-      checkOgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + 2 * numDims_));
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  for (int i = 0; i < numDims_; i++) {
-    directions_.push_back(config_.directions(i));
-  }
-  for (int i = 0; i < numDims_; i++) {
-    delays_.push_back(-1);
-  }
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-  activationState_.reset(
-      ActivationFunction::create(config_.active_state_type()));
-
-  return true;
-}
-
-void MDLstmLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  int numSequences = input.getNumSequences();
-  resetOutput(batchSize, numBlocks_);
-  CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  int* dimsData = input.cpuSequenceDims->getData();
-  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
-
-  for (int i = 0; i < numSequences; i++) {
-    std::vector<int> dims;
-    for (int j = 0; j < numDims_; j++) {
-      dims.push_back(dimsData[i * numDims_ + j]);
-    }
-    dimsV_.push_back(dims);
-  }
-
-  frameInputGate_.reserve(batchSize);
-  frameForgetGate_.reserve(batchSize);
-  frameOutputGate_.reserve(batchSize);
-  frameInputNode_.reserve(batchSize);
-  frameGate_.reserve(batchSize);
-  frameState_.reserve(batchSize);
-  framePreOutput_.reserve(batchSize);
-  frameOutput_.reserve(batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = frameGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_ * (3 + numDims_),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_ * (3 + numDims_),
-                              /* trans= */ false,
-                              useGpu_);
-    frameGate_.push_back(arg);
-  }
-  for (int i = frameInputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputGate_.push_back(arg);
-  }
-  for (int i = frameForgetGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ numDims_,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameForgetGate_.push_back(arg);
-  }
-  for (int i = frameOutputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutputGate_.push_back(arg);
-  }
-  for (int i = frameInputNode_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputNode_.push_back(arg);
-  }
-  for (int i = frameState_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    frameState_.push_back(arg);
-  }
-  for (int i = framePreOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    framePreOutput_.push_back(arg);
-  }
-  for (int i = frameOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_);
-    frameGate_[i].value->setData(gate_.value->getData() +
-                                 i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 0);
-    frameInputGate_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 1);
-    frameForgetGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * 2);
-    frameOutputGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * (2 + numDims_));
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  gate_.value->assign(*input.value);
-
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  for (int i = 0; i < numSequences; i++) {
-    CoordIterator coordIter(dimsV_[i], directions_);
-    forwardOneSequence(starts[i], coordIter);
-  }
-}
-
-void MDLstmLayer::forwardGate2OutputSequence(int start,
-                                             CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  preOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-  }
-
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      frameInputGate_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim =
-          Matrix::create(checkFg_->getData() + i * numBlocks_,
-                         1.0,
-                         numBlocks_,
-                         false,
-                         useGpu_);
-      fgGateOneDim->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
-    }
-  }
-  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
-  status.check();
-  status = activationGate_->forward(frameForgetGate_[idxCurr]);
-  status.check();
-  status = activation_->forward(frameInputNode_[idxCurr]);
-  status.check();
-
-  frameState_[idxCurr].value->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      frameState_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0);
-    }
-  }
-  frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value,
-                                        *frameInputGate_[idxCurr].value,
-                                        1.0,
-                                        1.0);
-
-  frameOutputGate_[idxCurr].value->addDotMul(
-      *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  status = activationGate_->forward(frameOutputGate_[idxCurr]);
-  status.check();
-
-  framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  status = activationState_->forward(framePreOutput_[idxCurr]);
-  status.check();
-
-  frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
-                                      *frameOutputGate_[idxCurr].value);
-}
-
-void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
-  for (coordIter.begin(); !coordIter.end(); ++coordIter) {
-    int offset = coordIter.offset();
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameGate_[start + offset].value->mul(
-            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
-      }
-    }
-    forwardGate2OutputSequence(start, coordIter);
-  }
-}
-
-void MDLstmLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = 0; i < batchSize; i++) {
-    if (frameState_[i].grad == NULL)
-      frameState_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-  for (int i = 0; i < batchSize; i++) {
-    if (framePreOutput_[i].grad == NULL)
-      framePreOutput_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_);
-    frameGate_[i].grad->setData(gate_.grad->getData() +
-                                i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 0);
-    frameInputGate_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 1);
-    frameForgetGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 2);
-    frameOutputGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * (2 + numDims_));
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    for (size_t i = 0; i < numSequences; i++) {
-      CoordIterator coordIter(dimsV_[i], directions_);
-      backwardOneSequence(starts[i], coordIter);
-    }
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void MDLstmLayer::backwardGate2OutputSequence(int start,
-                                              CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  std::vector<int> nextOffsetV;
-  preOffsetV.reserve(numDims_);
-  nextOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-    std::vector<int> nextPos;
-    if (coordIter.getNextPos(delays_, i, nextPos)) {
-      nextOffsetV[i] = coordIter.offset(nextPos);
-    } else {
-      nextOffsetV[i] = -1;
-    }
-  }
-
-  framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                        *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]).check();
-  frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
-
-  frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                         *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]).check();
-
-  frameState_[idxCurr].grad->addDotMul(
-      *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
-  for (int i = 0; i < numDims_; i++) {
-    if (nextOffsetV[i] >= 0) {
-      frameState_[idxCurr].grad->addDotMul(
-          *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].grad->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr fgGateOneDimVal = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].value->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim = Matrix::create(
-          checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_);
-
-      frameState_[idxCurr].grad->addDotMul(
-          *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0);
-      frameState_[idxCurr].grad->addDotMul(
-          *frameState_[start + nextOffsetV[i]].grad,
-          *fgGateOneDimVal,
-          1.0,
-          1.0);
-    }
-  }
-
-  frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputGate_[idxCurr].value);
-  frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputNode_[idxCurr].value);
-
-  frameForgetGate_[idxCurr].grad->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad,
-                                  *frameState_[start + preOffsetV[i]].value,
-                                  1.0,
-                                  1.0);
-    }
-  }
-
-  activationGate_->backward(frameInputGate_[idxCurr]).check();
-  activationGate_->backward(frameForgetGate_[idxCurr]).check();
-  activation_->backward(frameInputNode_[idxCurr]).check();
-
-  if (bias_->getWGrad()) {
-    for (int i = 0; i < numDims_; i++) {
-      if (preOffsetV[i] >= 0) {
-        checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad,
-                                *frameState_[start + preOffsetV[i]].value,
-                                1.0,
-                                1.0);
-
-        MatrixPtr fgGateOneDimGrad = Matrix::create(
-            frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-            1,
-            numBlocks_,
-            false,
-            useGpu_);
-        MatrixPtr checkFgOneDimGrad =
-            Matrix::create(checkFgGrad_->getData() + i * numBlocks_,
-                           1,
-                           numBlocks_,
-                           false,
-                           useGpu_);
-        checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad,
-                                     *frameState_[start + preOffsetV[i]].value,
-                                     1.0,
-                                     1.0);
-      }
-    }
-    checkOgGrad_->addDotMul(
-        *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0);
-  }
-}
-
-void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  for (coordIter.rbegin(); !coordIter.end(); --coordIter) {
-    int offset = coordIter.offset();
-    backwardGate2OutputSequence(start, coordIter);
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameOutput_[start + preOffset].grad->mul(
-            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
-        if (weight_->getWGrad()) {
-          weight_->getWGrad()->mul(
-              *frameOutput_[start + preOffset].value->getTranspose(),
-              *frameGate_[start + offset].grad,
-              1.0,
-              1.0);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
deleted file mode 100644
index 544b4082fa0..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNAddtoLayer.h"
-
-using namespace mkldnn;  // NOLINT
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
-
-bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  layerSize_ = getSize();
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
-  }
-  if (biasParameter_.get() != NULL) {
-    biases_ =
-        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
-  reshapeInput(bs, ih, iw);
-  ic = inputLayers_[0]->getSize() / ih / iw;
-  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
-           (size_t)bs * ic * ih * iw);
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
-    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
-  }
-
-  oc = ic;
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs, biasVal_, out);
-
-  std::shared_ptr<sum::primitive_desc> fwdPD;
-  std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
-}
-
-void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inputs, biasGrad_, out);
-
-  // backward only need share output grad to input grad
-  for (size_t i = 0; i < inputs.size(); i++) {
-    if (inputs[i] != nullptr) {
-      inputs[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
-    }
-  }
-
-  // backward bias
-  bwdBias_ = nullptr;
-  if (biasGrad_) {
-    std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_,
-                                               biasGrad_->getPrimitiveDesc());
-    auto biasPD =
-        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
-    std::vector<primitive::at> srcs;
-    for (size_t i = 0; i < grads_.size(); ++i) {
-      srcs.push_back(*(grads_[i]));
-    }
-    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
-    pipeline.push_back(*bwdBias_);
-  }
-}
-
-void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
-                                   const MatrixPtr& biasMat,
-                                   const MKLDNNMatrixPtr& out,
-                                   std::vector<MKLDNNMatrixPtr>& outs) {
-  auto pd = MKLDNNMatrix::createPrimitiveDesc(
-      {(int)layerSize_}, memory::format::x, engine_);
-  bias = MKLDNNMatrix::create(pd, biasMat);
-  outs.clear();
-  real* data = out->getData();
-  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
-  for (int i = 0; i < bs_; ++i) {
-    MatrixPtr tmp =
-        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
-    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                       MKLDNNMatrixPtr& bias,
-                                       MKLDNNMatrixPtr& out) {
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInValue(inputs[i], nullptr, i);
-    CHECK(inputs[i]);
-    inputs[i]->downSpatial();
-  }
-  for (size_t i = 1; i < inputs.size(); i++) {
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
-  }
-
-  resetOutValue(out, inputs[0]->getPrimitiveDesc());
-
-  if (biases_ && biases_->getW()) {
-    prepareBias(bias, biases_->getW(), out, vals_);
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
-                                  std::shared_ptr<sum::primitive_desc>& biasPD,
-                                  std::vector<MKLDNNMatrixPtr>& inputs,
-                                  MKLDNNMatrixPtr bias,
-                                  MKLDNNMatrixPtr out) {
-  std::vector<float> scales(inputs.size(), 1.0);
-  std::vector<memory::primitive_desc> srcPDs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
-  }
-  CHECK(out);
-  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-
-  biasPD = nullptr;
-  if (bias) {
-    std::vector<float> scales(2, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
-    biasPD.reset(
-        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
-    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<sum::primitive_desc>& pd,
-    std::shared_ptr<sum::primitive_desc>& biasPD,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  std::vector<primitive::at> srcs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcs.push_back(*(inputs[i]));
-  }
-  fwd_.reset(new sum(*pd, srcs, *out));
-  pipeline.push_back(*fwd_);
-
-  fwdBias_.clear();
-  if (biasPD == nullptr || bias == nullptr) {
-    return;
-  }
-  fwdBias_.resize(vals_.size());
-  for (size_t i = 0; i < vals_.size(); ++i) {
-    std::vector<primitive::at> srcs;
-    srcs.push_back(*(vals_[i]));
-    srcs.push_back(*bias);
-    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
-    pipeline.push_back(*fwdBias_[i]);
-  }
-}
-
-void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                       MKLDNNMatrixPtr& bias,
-                                       MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  CHECK(out);
-
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    prepareBias(bias, biases_->getWGrad(), out, grads_);
-  } else {
-    bias = nullptr;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
deleted file mode 100644
index 0b385e804fd..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Addto layer.
- *
- * The config file api is mkldnn_addto
- */
-class MKLDNNAddtoLayer : public MKLDNNLayer {
- protected:
-  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
-  size_t layerSize_;
-
-  std::unique_ptr<Weight> biases_;
-
-  // buffers for adding bias
-  std::vector<MKLDNNMatrixPtr> vals_;
-  std::vector<MKLDNNMatrixPtr> grads_;
-  // primitives for adding bias
-  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
-  std::shared_ptr<mkldnn::primitive> bwdBias_;
-
- public:
-  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNAddtoLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
- protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-
-  void prepareBias(MKLDNNMatrixPtr& bias,
-                   const MatrixPtr& biasMat,
-                   const MKLDNNMatrixPtr& out,
-                   std::vector<MKLDNNMatrixPtr>& outs);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBase.h b/paddle/legacy/gserver/layers/MKLDNNBase.h
deleted file mode 100644
index 786ceaf8608..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBase.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-typedef enum {
-  MKLDNN_BASE = 1,   // basical info of MKLDNN
-  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_FMTS = 2,   // format info of MKLDNN
-  MKLDNN_SIZES = 3,  // size info of MKLDNN
-  MKLDNN_ALL = 4,    // show all info of MKLDNN
-} MKLDNN_LOG_LEVEL;
-
-/**
- * @brief MKLDNN CPU engine.
- *
- */
-class CPUEngine {
- public:
-  static CPUEngine& Instance() {
-    // Thread-safe in C++11.
-    static CPUEngine myInstance;
-    return myInstance;
-  }
-
-  // Disallow copy or move
-  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
-  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
-  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
-  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
-
-  mkldnn::engine& getEngine() { return cpuEngine_; }
-
- protected:
-  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
-  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
-  ~CPUEngine() {}
-
- private:
-  mkldnn::engine cpuEngine_;
-};
-
-/**
- * @brief MKLDNN Stream.
- *
- */
-class MKLDNNStream {
- public:
-  MKLDNNStream() : ready_(false) { resetState(); }
-
-  virtual ~MKLDNNStream() {}
-
-  /**
-   * @brief Submit stream
-   * @param prims The primitives vector
-   * @param block Waiting for the stream to complete
-   */
-  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
-    resetState();
-    stream_->submit(prims).wait(block);
-    ready_ = false;
-  }
-
-  /**
-   * @brief Reset the mkldnn stream
-   */
-  void resetState() {
-    if (ready_) {
-      return;
-    }
-    // TODO(TJ): change me when mkldnn have method to reset this state
-    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
-    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-    ready_ = true;
-  }
-
- private:
-  bool ready_;
-  std::shared_ptr<mkldnn::stream> stream_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
deleted file mode 100644
index dbdfaff32f7..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNBatchNormLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
-
-bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  // first one is input layer
-  // the other two are created in config_parser.py saving moving mean and var
-  CHECK_EQ(inputLayers_.size(), 3U);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
-
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  ic_ = conf.channels();
-  ih_ = inputLayers_[0]->getOutput().getFrameHeight();
-  iw_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (iw_ == 0 && ih_ == 0) {
-    iw_ = conf.img_size();
-    ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  oc_ = ic_;
-  oh_ = ih_;
-  ow_ = iw_;
-  if (config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-  movingAvgFraction_ = config_.moving_average_fraction();
-  epsilon_ = config_.epsilon();
-
-  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
-                    << " --- global stats";
-  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
-
-  initWeight();
-  movingMean_.reset(new Weight(oc_, 1, parameters_[1], 0));
-  movingVar_.reset(new Weight(oc_, 1, parameters_[2], 0));
-  return true;
-}
-
-void MKLDNNBatchNormLayer::initWeight() {
-  weight_.reset(new Weight(1, oc_, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
-  }
-  CHECK_EQ(weight_ != nullptr, biases_ != nullptr)
-      << "only support have both weight and bias, or neither";
-  if (weight_ && weight_->getW()) {
-    CHECK(biases_ && biases_->getW());
-    valueScaleShift_ = Matrix::create(2, oc_, false, false);
-    valueScaleShift_->zeroMem();
-    VectorPtr scale(new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), 0));
-    VectorPtr shift(
-        new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), oc_));
-    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_VALUE);
-    scale->copyFrom(*wgt);
-    shift->copyFrom(*bias);
-    wgt->setData(valueScaleShift_->getData());
-    bias->setData(valueScaleShift_->getData() + oc_);
-  }
-  if (weight_ && weight_->getWGrad()) {
-    CHECK(biases_ && biases_->getWGrad());
-    gradScaleShift_ = Matrix::create(2, oc_, false, false);
-    gradScaleShift_->zeroMem();
-    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_GRADIENT);
-    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_GRADIENT);
-    wgt->setData(gradScaleShift_->getData());
-    bias->setData(gradScaleShift_->getData() + oc_);
-  }
-}
-
-void MKLDNNBatchNormLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-  // prepare mean and var if necessary
-  if (useGlobalStats_) {
-    CHECK(mean_);
-    CHECK(var_);
-    mean_->copyFrom(*(movingMean_->getW()));
-    var_->copyFrom(*(movingVar_->getW()));
-  }
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
-  // calculating and saving moving mean and variance
-  CHECK_EQ(useGlobalStats_, false);
-  movingMean_->getW()->add(
-      *mean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  // here var is v^2
-  movingVar_->getW()->add(*var_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-}
-
-void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  oh = ih;
-  ow = iw;
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& out) {
-  // In training phase, it will always calculate mean and var,
-  // so useGlobalStats must be false.
-  // In scoring phase, it depends on useGlobalStats choice.
-  if (passType_ != PASS_TEST && useGlobalStats_ == true) {
-    LOG(WARNING) << "use_global_stats is invalid setting in training phase";
-    useGlobalStats_ = false;
-  }
-
-  resetFwdBuffers(inputs[0], wgtVal_, out);
-
-  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
-}
-
-void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& out) {
-  std::shared_ptr<bn_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], wgtGrad_, out);
-
-  resetBwdPD(pd, inputs[0], wgtGrad_, out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
-}
-
-void MKLDNNBatchNormLayer::forward(PassType passType) {
-  MKLDNNLayer::forward(passType);
-
-  // calculate and save moving mean and variance
-  if (passType_ != PASS_TEST) {
-    calMovingMeanAndVar();
-  }
-}
-
-void MKLDNNBatchNormLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                           MKLDNNMatrixPtr& wgt,
-                                           MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-
-  if (valueScaleShift_) {
-    auto pd = MKLDNNMatrix::createPrimitiveDesc({2, oc_}, format::nc, engine_);
-    resetWithMatrix(wgt, valueScaleShift_, pd);
-  }
-  if (passType_ != PASS_TEST || useGlobalStats_) {
-    auto pd = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
-    mean_ = MKLDNNMatrix::create(pd);
-    var_ = MKLDNNMatrix::create(pd);
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdPD(
-    std::shared_ptr<bn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr in,
-    MKLDNNMatrixPtr wgt,
-    MKLDNNMatrixPtr out) {
-  flags_ = 0u;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  if (useGlobalStats_) {
-    flags_ = (flags_ | batch_normalization_flag::use_global_stats);
-  }
-  if (wgt) {
-    flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
-  }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
-  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-  if (wgt) {
-    CHECK_PRIMITIVE_DESC_EQ(wgt, pd->weights_primitive_desc());
-  }
-  if (passType_ != PASS_TEST || useGlobalStats_) {
-    CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
-    CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<bn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  if (passType_ == PASS_TEST) {
-    if (useGlobalStats_) {
-      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd,
-                                             *in,
-                                             (const primitive::at)(*mean_),
-                                             (const primitive::at)(*var_),
-                                             *wgt,
-                                             *out)
-                                : new bn_fwd(*pd,
-                                             *in,
-                                             (const primitive::at)(*mean_),
-                                             (const primitive::at)(*var_),
-                                             *out));
-    } else {
-      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out)
-                                : new bn_fwd(*pd, *in, *out));
-    }
-  } else {
-    CHECK_EQ(useGlobalStats_, false)
-        << "useGlobalStats should be false in training";
-    fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out, *mean_, *var_)
-                              : new bn_fwd(*pd, *in, *out, *mean_, *var_));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                           MKLDNNMatrixPtr& wgt,
-                                           MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-  if (gradScaleShift_) {
-    CHECK(wgtVal_);
-    resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNBatchNormLayer::resetBwdPD(
-    std::shared_ptr<bn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
-  auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
-  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
-}
-
-void MKLDNNBatchNormLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<bn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-  CHECK(inVals_[0]);
-  bwdData_.reset(
-      wgt && wgtVal_
-          ? new bn_bwd(
-                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
deleted file mode 100644
index 9aa20df98f3..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::batch_normalization_forward bn_fwd;
-typedef mkldnn::batch_normalization_backward bn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer BatchNorm layer.
- *
- * The config file api is mkldnn_batch_norm
- */
-class MKLDNNBatchNormLayer : public MKLDNNLayer {
- protected:
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
-
-  // Epsilon value used in the batch normalization formula.
-  real epsilon_;
-
-  // weight and bias in paddle
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-  // mkldnn use a large buffer store both scale and shift
-  // which are weight and bias in paddle corresponding.
-  MatrixPtr valueScaleShift_;
-  MatrixPtr gradScaleShift_;
-  // Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  // Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in every mini-batch.
-  bool useGlobalStats_;
-  // used in MKLDNN primitive desc
-  unsigned flags_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // local mean and variance
-  // when useGlobalStats_ they are loaded from moving mean and variance
-  // when do not useGlobalStats_ they are calculated from this mini-batch
-  MKLDNNMatrixPtr mean_;
-  MKLDNNMatrixPtr var_;
-
- public:
-  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
-
-  ~MKLDNNBatchNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
- protected:
-  void initWeight();
-  /**
-   * cal moving mean and variance.
-   * moving = moving * AvgFraction + local * (1 - AvgFraction)
-   */
-  void calMovingMeanAndVar();
-
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& wgt,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
deleted file mode 100644
index beed6176e11..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNConcatLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
-
-bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  CHECK_GT(inputLayers_.size(), 1UL);
-  CHECK(!biasParameter_);
-  return true;
-}
-
-void MKLDNNConcatLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  ic = inputLayers_[0]->getSize() / ih / iw;
-  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
-           (size_t)bs * ic * ih * iw);
-  CHECK_GT(inputLayers_.size(), 1UL);
-  channels_.resize(inputLayers_.size());
-  channels_[0] = ic;
-  oc = ic;
-  for (size_t i = 1; i < inputLayers_.size(); i++) {
-    int batchsize = 0, height = 0, witdh = 0;
-    reshapeInput(batchsize, height, witdh, i);
-    CHECK_EQ(bs, batchsize);
-    CHECK_EQ(ih, height);
-    CHECK_EQ(iw, witdh);
-
-    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
-    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
-    oc += channels_[i];
-  }
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
-                                 std::vector<MKLDNNMatrixPtr>& inputs,
-                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs, out);
-
-  std::shared_ptr<concat::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inputs, out);
-
-  resetFwdPipeline(pipeline, fwdPD, inputs, out);
-}
-
-void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
-                                 std::vector<MKLDNNMatrixPtr>& inputs,
-                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inputs, out);
-
-  resetBwdPipeline(pipeline, bwds_, inputs, out);
-}
-
-void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                        MKLDNNMatrixPtr& out) {
-  inputs.resize(inputLayers_.size());
-  bool has8c = false, has16c = false, hasnc = false;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInValue(inputs[i], nullptr, i, channels_[i]);
-    inputs[i]->downSpatial();
-    CHECK(inputs[i]);
-    auto dm = inputs[i]->getDims();
-    // inputs format can be different, but ndims must equal
-    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
-    CHECK_EQ(bs_, dm[0]);
-    CHECK_EQ(channels_[i], dm[1]);
-    if (dm.size() > 2) {
-      CHECK_EQ(ih_, dm[2]);
-      CHECK_EQ(iw_, dm[3]);
-    }
-    if (inputs[i]->getFormat() == format::nc) {
-      hasnc = true;
-    }
-    if (inputs[i]->getFormat() == format::nChw8c) {
-      has8c = true;
-    }
-    if (inputs[i]->getFormat() == format::nChw16c) {
-      has16c = true;
-    }
-  }
-
-  format outFmt;
-  if (has16c && oc_ % 16 == 0) {
-    outFmt = format::nChw16c;
-  } else if (has8c && oc_ % 8 == 0) {
-    outFmt = format::nChw8c;
-  } else if (hasnc) {
-    CHECK(oh_ == 1 && ow_ == 1);
-    outFmt = format::nc;
-  } else {
-    outFmt = format::nchw;
-  }
-  memory::dims outDims =
-      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
-  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
-                                   std::vector<MKLDNNMatrixPtr>& inputs,
-                                   MKLDNNMatrixPtr out) {
-  std::vector<memory::primitive_desc> srcPDs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
-  }
-  CHECK(out);
-  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-}
-
-void MKLDNNConcatLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<concat::primitive_desc>& pd,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& out) {
-  std::vector<primitive::at> srcs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcs.push_back(*(inputs[i]));
-  }
-  fwd_.reset(new concat(*pd, srcs, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                        MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  CHECK(out);
-
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    CHECK(inVals_[i]);
-    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNConcatLayer::resetBwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
-    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& out) {
-  // reset the backward primitives
-  memory::dims offsets = {0, 0, 0, 0};
-  prims.resize(inputs.size());
-  CHECK_EQ(inputs.size(), channels_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    auto viewPD = view::primitive_desc(
-        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
-    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
-                                         inputs[i]->getPrimitiveDesc());
-    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
-    offsets[axis_] += channels_[i];
-    // push to pipeline
-    pipeline.push_back(*prims[i]);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
deleted file mode 100644
index d7738df6c10..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Concatenate layer.
- *
- * The config file api is mkldnn_concat
- */
-class MKLDNNConcatLayer : public MKLDNNLayer {
- protected:
-  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
-  // input channel numbers
-  std::vector<int> channels_;
-
-  // concat_dimension in MKLDNN
-  // if axis_ == 0, concat batchsize
-  // if axis_ == 1, concat channel (default)
-  int axis_;
-
- public:
-  explicit MKLDNNConcatLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), axis_(1) {}
-
-  ~MKLDNNConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    CHECK_EQ(channels_.size(), inputLayers_.size());
-    for (size_t i = 0; i < channels_.size(); ++i) {
-      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
-                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
-                         << ", " << iw_;
-    }
-    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
-                       << ", " << ow_;
-  }
-
-  size_t keepCondition() {
-    // reset when the total element size of all inputs changed
-    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
-    for (size_t i = 1; i < inputLayers_.size(); ++i) {
-      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
-    }
-    return totalSize;
-  }
-
- protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
deleted file mode 100644
index b47bf14821f..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNConvLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
-
-bool MKLDNNConvLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(config_.shared_biases()) << "Only support shared biases yet";
-
-  oc_ = config_.num_filters();
-  const ConvConfig& conf = config_.inputs(0).conv_conf();
-  ic_ = conf.channels();
-  fw_ = conf.filter_size();
-  fh_ = conf.filter_size_y();
-  pw_ = conf.padding();
-  ph_ = conf.padding_y();
-  dw_ = conf.dilation();
-  dh_ = conf.dilation_y();
-  sw_ = conf.stride();
-  sh_ = conf.stride_y();
-  gp_ = conf.groups();
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  caffeMode_ = conf.caffe_mode();
-  CHECK(caffeMode_) << "Only support caffe mode yet";
-  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
-  // check group setting
-  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
-  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
-
-  // create weight
-  size_t height = oc_ / gp_;
-  size_t width = ic_ * fh_ * fw_;
-  CHECK_EQ(parameters_[0]->getSize(), height * width);
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNConvLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  // the paddle weight format is oihw or goihw
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNConvLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  // cal output sizes
-  // oc can not be changed
-  int fh = (fh_ - 1) * dh_ + 1;
-  int fw = (fw_ - 1) * dw_ + 1;
-  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
-  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdPD(fwdPD_);
-
-  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdWgtPD(bwdWgtPD);
-
-  resetBwdDataPD(bwdDataPD);
-
-  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
-                                       memory::dims& bias,
-                                       memory::dims& stride,
-                                       memory::dims& dilation,
-                                       memory::dims& padL,
-                                       memory::dims& padR) {
-  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
-                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
-  bias = memory::dims{oc_};
-  stride = memory::dims{sh_, sw_};
-  padL = memory::dims{ph_, pw_};
-  padR = getPaddingR();
-  // note: mkldnn dilation start from 0
-  dilation = memory::dims{dh_ - 1, dw_ - 1};
-}
-
-void MKLDNNConvLayer::resetFwdPD(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
-  // dims for conv
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  conv_fwd::desc fwdDesc =
-      biases_ && biases_->getW()
-          ? conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(biasDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind)
-          : conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind);
-  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNConvLayer::resetFwdBuffers(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(pd);
-  resetInValue(
-      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
-
-  resetOutValue(out, pd->dst_primitive_desc());
-
-  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
-
-  if (biases_ && biases_->getW()) {
-    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNConvLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNConvLayer::resetBwdWgtPD(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  // create backward weight using input, output and weight value memory desc
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  CHECK(wgtVal_) << "Should have weight value";
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  auto bwdWgtDesc = biasVal_ != nullptr
-                        ? conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            biasVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind)
-                        : conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind);
-  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      pd->diff_weights_primitive_desc(),
-      "primitive desc of weight value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdDataPD(
-    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
-  pd = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  // create backward data using input and output value memory desc
-  // but using weight memory desc with any format
-  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVals_[0]->getMemoryDesc(),
-                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
-                                        outVal_->getMemoryDesc(),
-                                        strides,
-                                        padL,
-                                        padR,
-                                        padding_kind::zero);
-  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(
-      inVals_[0],
-      pd->diff_src_primitive_desc(),
-      "primitive desc of in value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdBuffers(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(wgtPD);
-  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
-
-  resetWithMatrix(
-      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      wgt->getPrimitiveDesc(),
-      "primitive desc of weight grad and value should be equal");
-
-  bias = nullptr;
-  if (biases_ && biases_->getWGrad()) {
-    resetWithMatrix(
-        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
-    CHECK(bias);
-    CHECK_PRIMITIVE_DESC_EQ(
-        biasVal_,
-        bias->getPrimitiveDesc(),
-        "primitive desc of bias grad and value should be equal");
-  }
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  resetInGrad(in, dataPD->diff_src_primitive_desc());
-  resetWgtValBwdData(dataPD, wgtValBwdData_);
-}
-
-void MKLDNNConvLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  // add bwdWgt handle
-  if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  if (cvtWgtVal_) {
-    pipeline.push_back(*cvtWgtVal_);
-  }
-  // add bwdData handle
-  CHECK(wgtValBwdData_) << "Should have weight memory";
-  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-void MKLDNNConvLayer::resetWgtValBwdData(
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& wgt) {
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  // create new weight value for backward data, and create reorder if necessary
-  // since the primitive_desc would be different with wgtVal_
-  CHECK(wgtVal_) << "should have weight value";
-  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
-    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
-    CHECK(cvtWgtVal_);
-  } else {
-    wgtValBwdData_ = wgtVal_;
-  }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
-                    << wgtValBwdData_->getFormat();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.h b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
deleted file mode 100644
index d399035ed3a..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::convolution_forward conv_fwd;
-typedef mkldnn::convolution_backward_weights conv_bwdWgt;
-typedef mkldnn::convolution_backward_data conv_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer conv layer.
- *
- * The config file api is mkldnn_conv
- */
-class MKLDNNConvLayer : public MKLDNNLayer {
- protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // dilation height and width
-  int dh_, dw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-  // group number
-  int gp_;
-
-  // in resetBwdData, the format of wgtValBwdData_ is different with wgtVal_
-  MKLDNNMatrixPtr wgtValBwdData_;
-  // convert handle from wgtVal_ to wgtValBwdData_
-  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
-
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // true by default, which impact the calculation of output image size.
-  // details can refer to mathUtil.h
-  bool caffeMode_;
-
-  // weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit MKLDNNConvLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
-
-  ~MKLDNNConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
-  }
-
- protected:
-  /**
-   * load the dims settings of this conv
-   */
-  void loadConvSettings(mkldnn::memory::dims& wgt,
-                        mkldnn::memory::dims& bias,
-                        mkldnn::memory::dims& stride,
-                        mkldnn::memory::dims& dilation,
-                        mkldnn::memory::dims& padL,
-                        mkldnn::memory::dims& padR);
-
-  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * reset MKLDNNMatrix of weight value for backward data
-   * since the primitive_desc would be different with wgtVal_
-   */
-  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                          MKLDNNMatrixPtr& wgt);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_convolution_forward_common.hpp
-   * @note: mkldnn dilation start from 0 while paddle start from 1
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
-        ++padR[0];
-      }
-      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
deleted file mode 100644
index f3747c7db84..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNFcLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
-
-bool MKLDNNFcLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
-
-  // output size, cat not be changed
-  oc_ = getSize();
-  oh_ = 1;
-  ow_ = 1;
-  ih_ = 1;
-  iw_ = 1;
-
-  // input size can not change in FC
-  iLayerSize_ = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
-
-  // create weight
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNFcLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNFcLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
-  ic = iLayerSize_ / (ih * iw);
-  CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible";
-  CHECK_EQ(size_t(oc), getSize());
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc);
-}
-
-void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
-
-  resetBwdDataPD(bwdDataPD, inputs[0], out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  in->downSpatial();
-
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
-  resetOutValue(out, outPD);
-
-  format wgtFmt = format::oihw;
-  if (in->getFormat() == format::nChw8c) {
-    wgtFmt = format::oIhw8i;
-  } else if (in->getFormat() == format::nChw16c) {
-    wgtFmt = format::oIhw16i;
-  }
-  auto wgtPD =
-      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
-  resetWithMatrix(wgt, weight_->getW(), wgtPD);
-  wgt->downSpatial();
-
-  if (biases_ && biases_->getW()) {
-    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
-    resetWithMatrix(bias, biases_->getW(), biasPD);
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                               MKLDNNMatrixPtr in,
-                               MKLDNNMatrixPtr wgt,
-                               MKLDNNMatrixPtr bias,
-                               MKLDNNMatrixPtr out) {
-  CHECK(in);
-  CHECK(wgt);
-  CHECK(out);
-  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        bias->getMemoryDesc(),
-                                                        out->getMemoryDesc())
-                                         : fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        out->getMemoryDesc());
-  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNFcLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-
-  CHECK(wgtVal_);
-  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-
-  if (biasVal_) {
-    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetBwdWgtPD(
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  fc_bwdWgt::desc bwdWgtDesc =
-      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             bias->getMemoryDesc(),
-                             out->getMemoryDesc())
-           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             out->getMemoryDesc());
-  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdDataPD(
-    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_);
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
-      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
-  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (bwdDataPD == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_) << "Should have weight memory";
-  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.h b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
deleted file mode 100644
index a704066cc81..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::inner_product_forward fc_fwd;
-typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
-typedef mkldnn::inner_product_backward_data fc_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer fc layer.
- *
- * The config file api is mkldnn_fc
- */
-class MKLDNNFcLayer : public MKLDNNLayer {
- protected:
-  // input layer size, can not be change after init
-  size_t iLayerSize_;  // == ic * ih * iw
-
-  // if has already init the weight
-  bool hasInitedWgt_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
-
-  // fc weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false) {}
-
-  ~MKLDNNFcLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& wgt,
-                     MKLDNNMatrixPtr& bias,
-                     MKLDNNMatrixPtr& out);
-  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-                      MKLDNNMatrixPtr& in,
-                      MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
deleted file mode 100644
index 739482348f7..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNLRNLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
-
-bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  localSize_ = conf.size();
-  alpha_ = conf.scale();
-  beta_ = conf.pow();
-
-  ic_ = conf.channels();
-  oc_ = ic_;
-  iw_ = conf.img_size();
-  ow_ = conf.output_x();
-  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  CHECK_EQ(iw_, ow_);
-  CHECK_EQ(ih_, oh_);
-  return true;
-}
-
-void MKLDNNLRNLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  std::shared_ptr<lrn_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  resetOutValue(out, in->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr in,
-                                MKLDNNMatrixPtr out) {
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = lrn_fwd::desc(pk,
-                               algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
-  // prepare workspace if necessary
-  workspace_ =
-      passType_ != PASS_TEST
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNLRNLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
-             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(out);
-  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               out->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNLRNLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-  CHECK(inVals_[0]);
-  CHECK(workspace_);
-  bwdData_ = std::make_shared<lrn_bwd>(
-      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
deleted file mode 100644
index 028438f2c93..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::lrn_forward lrn_fwd;
-typedef mkldnn::lrn_backward lrn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
- *
- * The config file api is mkldnn_lrn
- */
-class MKLDNNLRNLayer : public MKLDNNLayer {
- protected:
-  // save forward primitive_desc, which can be used in backward
-  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_lrn_backward.cpp, lrn need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
-  int localSize_;
-  float alpha_, beta_;  // scale and pow in paddle
-
- public:
-  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNLRNLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
deleted file mode 100644
index f0acffe8716..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-bool MKLDNNLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
-                          << "Please set WITH_MKL=ON "
-                          << "and set use_mkldnn=True";
-  CHECK(!useGpu_) << "Do not support GPU yet";
-
-  // set device id before Layer::init
-  setDevice(MKLDNN_DEVICE);
-  // change param device to MKLDNN device
-  setParamsDevice(MKLDNN_DEVICE, parameterMap);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setOutputMap();
-  checkCPUOutputsNumber();
-
-  stream_.reset(new MKLDNNStream());
-  engine_ = CPUEngine::Instance().getEngine();
-  return true;
-}
-
-void MKLDNNLayer::forward(PassType passType) {
-  passType_ = passType;
-
-  {
-    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    CHECK(!inputLayers_.empty());
-    copySeqInfoToOutputs();
-    if (condition_ != keepCondition()) {
-      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-      condition_ = keepCondition();
-      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-      printSizeInfo();
-      // the output_.value and output_.grad are shared with CPU device
-      shareCPUDevice();
-      pipelineFwd_.clear();
-      inVals_.resize(inputLayers_.size(), nullptr);
-      extInVals_.resize(inputLayers_.size(), nullptr);
-      cvtInVals_.resize(inputLayers_.size(), nullptr);
-      resetFwd(pipelineFwd_, inVals_, outVal_);
-      prepareValueConversions(pipelineFwd_);
-      convertWeightsFromPaddle();
-      printValueFormat();
-      needResetBwd_ = true;
-    }
-
-    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
-      // Update input value data when input layer is "data" type,
-      // since the input value data address might be changed.
-      CHECK(extInVals_[0]);
-      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
-    }
-
-    if (!outputOnlyMKLDNN_) {
-      clearGrads();
-    }
-    stream_->submit(pipelineFwd_);
-  }
-  {
-    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MKLDNNLayer::backward(const UpdateCallback& callback) {
-  if (needResetBwd_) {
-    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-    pipelineBwd_.clear();
-    inGrads_.resize(inputLayers_.size(), nullptr);
-    extInGrads_.resize(inputLayers_.size(), nullptr);
-    cvtInGrads_.resize(inputLayers_.size(), nullptr);
-    pipelineMergeGrad_.clear();
-    mergeGrad_ = nullptr;
-    resetBwd(pipelineBwd_, inGrads_, outGrad_);
-    prepareGradConversions(pipelineBwd_);
-    printGradFormat();
-    needResetBwd_ = false;
-  }
-
-  // merge grad must before backward activation
-  if (mergeGrad_) {
-    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
-    stream_->submit(pipelineMergeGrad_);
-  }
-  {
-    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
-    backwardActivation();
-  }
-  {
-    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-    stream_->submit(pipelineBwd_);
-  }
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    updateWeights(callback);
-  }
-}
-
-void MKLDNNLayer::reshapeInput(int& batchsize,
-                               int& height,
-                               int& width,
-                               size_t idx) {
-  const Argument& input = inputLayers_[idx]->getOutput();
-  batchsize = input.getBatchSize();
-  int h = input.getFrameHeight();
-  int w = input.getFrameWidth();
-  if (h != 0) {
-    height = h;
-  }
-  if (w != 0) {
-    width = w;
-  }
-  height = height != 0 ? height : 1;
-  width = width != 0 ? width : 1;
-}
-
-void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
-  output_.setFrameHeight(height);
-  output_.setFrameWidth(width);
-  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-    outputOtherDevice_[i].setFrameHeight(height);
-    outputOtherDevice_[i].setFrameWidth(width);
-  }
-}
-
-void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                                  const MatrixPtr& mat,
-                                  memory::primitive_desc pd) {
-  dnn = nullptr;
-  if (mat == nullptr) {
-    return;
-  }
-  dnn = MKLDNNMatrix::create(pd, mat);
-}
-
-void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in,
-    const std::shared_ptr<memory::primitive_desc>& intPD,
-    size_t idx,
-    int inputChannel) {
-  cvtInVals_[idx] = nullptr;
-  extInVals_[idx] = nullptr;
-  in = nullptr;
-  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
-  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
-  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
-  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
-  if (extInVals_[idx] == nullptr ||
-      extInVals_[idx]->getFormat() == format::nc) {
-    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
-  }
-  in = extInVals_[idx];
-  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
-    return;
-  }
-  // need create reorder
-  in = MKLDNNMatrix::create(*intPD);
-  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
-  CHECK(cvtInVals_[idx]) << "should not be emptry";
-}
-
-void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
-                                memory::primitive_desc intPD) {
-  cvtOutVal_ = nullptr;
-  out = MKLDNNMatrix::create(intPD, output_.value);
-  extOutVal_ = out;
-  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
-  extOutVal_ = MKLDNNMatrix::create(
-      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
-  out = MKLDNNMatrix::create(intPD);
-  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
-  CHECK(cvtOutVal_) << "should not be empty";
-}
-
-void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD,
-                              size_t idx) {
-  cvtInGrads_[idx] = nullptr;
-  extInGrads_[idx] = nullptr;
-  in = nullptr;
-  LayerPtr& input = inputLayers_[idx];
-  if (input->getOutputGrad() == nullptr) {
-    // no need input grad
-    return;
-  }
-  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
-      << "only support input is MKLDNN layer or only have one output layer";
-  // when input is a mkldnn branch node,
-  // this layer will save input grad to a internal buffer,
-  // and the mkldnn input layer will merge them to actual prev->output_.grad
-  const MatrixPtr& inMat =
-      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
-  in = MKLDNNMatrix::create(intPD, inMat);
-  Argument& arg = input->getOutput(this->getName());
-  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
-  if (inputIsOnlyMKLDNN()) {
-    return;
-  }
-
-  extInGrads_[idx] = in;
-  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK(extInVals_[idx] != nullptr &&
-        isPaddleFormat(extInVals_[idx]->getFormat()))
-      << "should have external input value and the format must be nchw(nc)";
-  extInGrads_[idx] =
-      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
-  in = MKLDNNMatrix::create(intPD);
-  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
-  CHECK(cvtInGrads_[idx]);
-}
-
-void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
-                               memory::primitive_desc intPD) {
-  cvtOutGrad_ = nullptr;
-  extOutGrad_ = nullptr;
-  out = nullptr;
-  MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(intPD, outMat);
-  resetMergeGrad(out);
-  if (outputIsOnlyMKLDNN()) {
-    return;
-  }
-  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
-  extOutGrad_ = out;
-  if (isPaddleFormat(extOutGrad_->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
-      << "should have external output value and the format must be nchw(nc)";
-  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
-  CHECK_PRIMITIVE_DESC_EQ(outVal_, intPD);
-  out = MKLDNNMatrix::create(intPD);
-  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
-  CHECK(cvtOutGrad_);
-}
-
-void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
-  mergeGrad_ = nullptr;
-  pipelineMergeGrad_.clear();
-  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
-    // do not merge when output is not all MKLDNN or only one output
-    return;
-  }
-  CHECK(out) << "should have reset internal ouput grad";
-  std::vector<float> scales(outputMap_.size(), 1.0);
-  std::vector<memory::primitive_desc> srcPDs;
-  std::vector<primitive::at> srcs;
-  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
-    MKLDNNMatrixPtr src =
-        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
-    CHECK(src) << "should be MKLDNNMatrix";
-    auto srcDims = src->getDims();
-    auto dstDims = out->getDims();
-    CHECK_EQ(srcDims.size(), dstDims.size());
-    for (size_t i = 0; i < srcDims.size(); ++i) {
-      CHECK_EQ(srcDims[i], dstDims[i]);
-    }
-    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
-                      << ", format " << src->getFormat();
-    srcPDs.push_back(src->getPrimitiveDesc());
-    srcs.push_back(*src);
-  }
-
-  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
-  mergeGrad_.reset(new sum(sumPD, srcs, *out));
-  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLayer.h
deleted file mode 100644
index 94dc8625f68..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLayer.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "MKLDNNBase.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/math/MKLDNNMatrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_bool(use_mkldnn);
-
-namespace paddle {
-
-class MKLDNNLayer;
-typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
-
-/**
- * @brief Base class of MKLDNNlayer.
- *
- */
-class MKLDNNLayer : public Layer {
- protected:
-  // batch size
-  int bs_;
-  // their sizes are always from the first input layer
-  // input image channel, height and width
-  int ic_, ih_, iw_;
-  // output image channel, height and width
-  int oc_, oh_, ow_;
-
-  // the condition that forward need be reset
-  size_t condition_;
-  // backward also need reset after reset forward handle
-  bool needResetBwd_;
-
-  // is output only mkldnn
-  bool outputOnlyMKLDNN_;
-
-  // mkldnn engine, stream and primivtives
-  mkldnn::engine engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwdWgt_;
-  std::shared_ptr<mkldnn::primitive> bwdData_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
-  /* Value and grad are seperated as internal and external buffers.
-   * Each MKLDNNLayer must init or reset internal buffer at least,
-   * and the external buffer format is always nchw of nc(when h==w==1),
-   * which is the same format as paddle.
-   * The output_.value and output_.grad always save the external data,
-   * when mixed with cpu device.
-   * When all layers are mkldnn layers, they could save internal data.
-   */
-  // below MKLDNNMatrix buffers are all internal buffers
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-  MKLDNNMatrixPtr outVal_;
-  MKLDNNMatrixPtr outGrad_;
-  // below are external value and grad
-  std::vector<MKLDNNMatrixPtr> extInVals_;
-  std::vector<MKLDNNMatrixPtr> extInGrads_;
-  MKLDNNMatrixPtr extOutVal_;
-  MKLDNNMatrixPtr extOutGrad_;
-  // convert handle between external and internal buffers
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
-  // weight and bias are always internal buffers
-  MKLDNNMatrixPtr wgtVal_;
-  MKLDNNMatrixPtr wgtGrad_;
-  MKLDNNMatrixPtr biasVal_;
-  MKLDNNMatrixPtr biasGrad_;
-
-  // merge grad primitive
-  std::shared_ptr<mkldnn::primitive> mergeGrad_;
-  std::vector<mkldnn::primitive> pipelineMergeGrad_;
-  // tmp input argument to save input grad, only used to merge grad
-  Argument tmpInArg_;
-
- public:
-  explicit MKLDNNLayer(const LayerConfig& config)
-      : Layer(config),
-        ih_(0),
-        iw_(0),
-        condition_(0),
-        needResetBwd_(true),
-        outputOnlyMKLDNN_(false),
-        engine_(mkldnn::engine::cpu, 0),
-        stream_(nullptr),
-        fwd_(nullptr),
-        bwdWgt_(nullptr),
-        bwdData_(nullptr) {}
-
-  ~MKLDNNLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
-
-  /**
-   * reshape the input and output channels and image sizes
-   * and reset output buffer size
-   */
-  virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
-
-  /**
-   * reset the mkldnn forward primitve and memories
-   * only would be called when input size changes
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * reset the mkldnn backward primitve and memories
-   * only would be called when needed
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * Update weights and biases if necessary.
-   */
-  virtual void updateWeights(const UpdateCallback& callback) {}
-
-  /**
-   * convert weight from paddle format to mkldnn format
-   * weight_ will be override
-   */
-  virtual void convertWeightsFromPaddle() {}
-
-  /**
-   * convert mkldnn weight to paddle format
-   * weight_ will be override
-   */
-  virtual void convertWeightsToPaddle() {}
-
-  /**
-   * add this interface as public for unit test
-   */
-  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
-
- protected:
-  /**
-   * Some layers may have different condition to reset the forward.
-   * The function returns the condition that do not need reset forward.
-   */
-  inline virtual size_t keepCondition() {
-    // reset when the first input element size changed, not only the batchsize
-    return inputLayers_[0]->getOutputValue()->getElementCnt();
-  }
-
-  /**
-   * reshape the input image sizes and input batchsize
-   */
-  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
-
-  /**
-   * reshape output image sizes
-   */
-  void reshapeOutput(size_t height, size_t width);
-
-  /**
-   * reset MKLDNNMatrix from Matrix and internal primitive desc.
-   * reset nullptr if matrix or primitive desc is empty
-   */
-  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                       const MatrixPtr& mat,
-                       mkldnn::memory::primitive_desc pd);
-
-  /**
-   * reset input value from input MKLDNNMatrix and internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   * input channel may be different in concat.
-   */
-  void resetInValue(
-      MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
-      size_t idx = 0,
-      int inputChannel = 0);
-
-  /**
-   * reset output value from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetOutValue(MKLDNNMatrixPtr& out,
-                     mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset input grad from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetInGrad(MKLDNNMatrixPtr& in,
-                   mkldnn::memory::primitive_desc intPD,
-                   size_t idx = 0);
-
-  /**
-   * reset output grad from internal primitive desc.
-   * merge grad if necessary.
-   * reset both internal and external buffer and create reorder if necessary.
-   * note: about merge grad, when this layer has several outputs,
-   *       it could not be mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset the merge grad primitive if necessary.
-   * note: do not support the grads mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetMergeGrad(MKLDNNMatrixPtr& out);
-
- protected:
-  /**
-   * Set deviceId of this layer.
-   */
-  void setDevice(int id) { deviceId_ = id; }
-
-  /**
-   * check the format is nchw or nc,
-   * which is supported by Paddle default memory layout
-   */
-  bool isPaddleFormat(mkldnn::memory::format fmt) {
-    if (fmt == mkldnn::memory::format::nchw ||
-        fmt == mkldnn::memory::format::nc) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * If input only has MKLDNN device.
-   * Otherwise, only support the previous layer using CPU device.
-   */
-  bool inputIsOnlyMKLDNN(int index = 0) {
-    int prevDevice = getPrev(index)->getDeviceId();
-    if (prevDevice == MKLDNN_DEVICE) {
-      return true;
-    } else {
-      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
-      return false;
-    }
-  }
-
-  /**
-   * If output only has MKLDNN device.
-   * Otherwise, other devices should only using CPU device.
-   */
-  bool outputIsOnlyMKLDNN() {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
-    }
-    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
-    return outputOnlyMKLDNN_;
-  }
-
-  /**
-   * print info about sizes
-   */
-  virtual void printSizeInfo() {
-    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
-                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
-                       << ", oh: " << oh_ << ", ow: " << ow_;
-  }
-
-  /**
-   * print the mkldnn memory format of value
-   */
-  virtual void printValueFormat() {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      if (!inVals_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
-                                                  : inVals_[i]->getFormat())
-                        << " >>> " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
-                        << (extOutVal_ ? extOutVal_->getFormat()
-                                       : outVal_->getFormat());
-    }
-    if (wgtVal_) {
-      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
-    }
-    if (biasVal_) {
-      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
-    }
-  }
-
-  /**
-   * print the mkldnn memory format of grad
-   */
-  virtual void printGradFormat() {
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
-                        << (extOutGrad_ ? extOutGrad_->getFormat()
-                                        : outGrad_->getFormat());
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      if (!inGrads_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
-                                                   : inGrads_[i]->getFormat())
-                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
-    }
-    if (wgtGrad_) {
-      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
-    }
-    if (biasGrad_) {
-      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
-    }
-  }
-
- private:
-  /**
-   * clear all grad
-   */
-  void clearGrads() {
-    if (output_.grad) {
-      output_.grad->zeroMem();
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].grad) {
-        outputOtherDevice_[i].grad->zeroMem();
-      }
-    }
-  }
-
-  /**
-   * Set deviceId of the params used in this layer.
-   */
-  void setParamsDevice(int id, const ParameterMap& parameterMap) {
-    for (auto& inputConfig : config_.inputs()) {
-      if (inputConfig.has_input_parameter_name()) {
-        ParameterPtr parameter;
-        std::string name = inputConfig.input_parameter_name();
-        CHECK(mapGet(name, parameterMap, &parameter))
-            << "Cannot find input parameter " << name << " for layer "
-            << getName();
-        parameter->setDevice(id);
-      }
-    }
-    if (config_.has_bias_parameter_name()) {
-      ParameterPtr parameter;
-      std::string name = config_.bias_parameter_name();
-      CHECK(mapGet(name, parameterMap, &parameter))
-          << "Cannot find bias parameter " << name << " for layer "
-          << getName();
-      parameter->setDevice(id);
-    }
-  }
-
-  /**
-   * Set output map of prev layers.
-   */
-  void setOutputMap() {
-    outputMap_.clear();
-    for (size_t i = 0; i < inputLayers_.size(); ++i) {
-      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
-    }
-  }
-
-  /**
-   * if have cpu device, share value and grad data with output_
-   */
-  void shareCPUDevice() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].value = output_.value;
-      outputOtherDevice_[i].grad = output_.grad;
-    }
-  }
-
-  /**
-   * Check the cpu device number of outputOtherDevice_.
-   * should have only one at most.
-   */
-  void checkCPUOutputsNumber(int max = 1) {
-    int cnt = 0;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-        ++cnt;
-      }
-    }
-    CHECK_LE(cnt, max) << "too much CPU devies";
-  }
-
-  /**
-   * copy SeqInfo from input layer to this output and other output devices.
-   * @note: do not use getInput(0) since it used this deviceId_,
-   *        use "inputLayers_[0]->getOutput()" instead.
-   */
-  void copySeqInfoToOutputs() {
-    if (inputLayers_.empty() || !needSequenceInfo_) {
-      return;
-    }
-    const Argument& input = inputLayers_[0]->getOutput();
-    output_.sequenceStartPositions = input.sequenceStartPositions;
-    output_.subSequenceStartPositions = input.subSequenceStartPositions;
-    output_.cpuSequenceDims = input.cpuSequenceDims;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].sequenceStartPositions =
-          output_.sequenceStartPositions;
-      outputOtherDevice_[i].subSequenceStartPositions =
-          output_.subSequenceStartPositions;
-      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-    }
-  }
-
-  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // MKLDNNLayer output value should be MKLDNNMatrix
-    // so external output value is necessary.
-    // Then external input value is not necessary,
-    // since input may be mkldnn internal buffer.
-    CHECK(extOutVal_) << "external output value is necessary";
-    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
-    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
-      if (cvtInVals_[i]) {
-        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
-      }
-    }
-    if (cvtOutVal_) {
-      pipeline.push_back(*cvtOutVal_);
-    }
-  }
-  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
-    }
-    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
-      if (cvtInGrads_[i]) {
-        pipeline.push_back(*cvtInGrads_[i]);
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
deleted file mode 100644
index 83d980538d2..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNPoolLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
-
-bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  ic_ = conf.channels();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  oc_ = ic_;
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  fh_ = conf.size_y();
-  fw_ = conf.size_x();
-  ph_ = conf.padding_y();
-  pw_ = conf.padding();
-  sh_ = conf.stride_y();
-  sw_ = conf.stride();
-
-  const std::string& type = conf.pool_type();
-  if (type == "max-projection") {
-    poolAlgo_ = algorithm::pooling_max;
-  } else if (type == "avg-projection") {
-    // paddle only use exclude_padding
-    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
-  } else {
-    LOG(FATAL) << "unknow pooling type!";
-  }
-  return true;
-}
-
-void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-
-  // cal output sizes
-  // paddle used false caffeMode for pooling
-  oh = outputSize(ih, fh_, ph_, sh_, false);
-  ow = outputSize(iw, fw_, pw_, sw_, false);
-  reshapeOutput(oh, ow);
-
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<pool_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  padding_kind padKind = padding_kind::zero;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = pool_fwd::desc(pk,
-                                poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padKind);
-  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
-
-  // prepare workspace if necessary
-  workspace_ =
-      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
-             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  CHECK(out);
-  auto bwdDesc = pool_bwd::desc(poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padding_kind::zero);
-  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-
-  bwdData_ =
-      workspace_
-          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
-          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
deleted file mode 100644
index 1eb0ee4ad94..00000000000
--- a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::pooling_forward pool_fwd;
-typedef mkldnn::pooling_backward pool_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer pool layer.
- *
- * The config file api is mkldnn_pool
- */
-class MKLDNNPoolLayer : public MKLDNNLayer {
- protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-
-  // pooling_avg or pooling_max
-  mkldnn::algorithm poolAlgo_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_pooling_forward.cpp, pool need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
- public:
-  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNPoolLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_;
-  }
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_pooling_forward.cpp
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
-        ++padR[0];
-      }
-      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
deleted file mode 100644
index d928ebc3248..00000000000
--- a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLPackedRecurrentLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
-
-bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
-  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
-  packed_weight_->pack();
-  if (needGradient_) {
-    packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
-    packed_weightT_->pack();
-  }
-  return true;
-}
-
-void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
-  RecurrentLayer::backward(callback);
-  packed_weight_->pack();
-  if (needGradient_) {
-    packed_weightT_->pack();
-  }
-}
-
-void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
-                                           size_t numSequences,
-                                           const int* starts) {
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->copyFromSeq(*output_.value);
-
-  {
-    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
-    /* forward one batch */
-    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
-      MatrixPtr batchValue = batchValue_->getBatchValue(n);
-
-      if (n != 0) {
-        MatrixPtr preBatchValue =
-            batchValue_->getBatchValue(n - 1, batchValue->getHeight());
-
-        packed_weight_->gemm_compute(preBatchValue, batchValue);
-      }
-      Argument arg;
-      arg.value = batchValue;
-      activation_->forward(arg).check();
-    }
-  }
-  batchValue_->copyBackSeq(*output_.value);
-}
-
-void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
-                                            size_t numSequences,
-                                            const int* starts) {
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  size_t numBatch = batchGrad_->getNumBatch();
-  bool backwardByBatch = numBatch < numSequences;
-
-  batchGrad_->copyFromSeq(*output_.grad);
-  {
-    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
-    /* backward one batch */
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
-      MatrixPtr batchValue =
-          batchValue_->getBatchValue(n, batchGrad->getHeight());
-
-      Argument arg;
-      arg.value = batchValue;
-      arg.grad = batchGrad;
-      activation_->backward(arg).check();
-
-      if (n != 0) {
-        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
-        packed_weightT_->gemm_compute(batchGrad, batchValue);
-      }
-
-      if (backwardByBatch && weight_->getWGrad()) {
-        if (n != 0) {
-          /* backward weight */
-          batchValue =
-              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
-          weight_->getWGrad()->mul(
-              *batchValue->getTranspose(), *batchGrad, 1, 1);
-        }
-      }
-    }
-  }
-
-  batchGrad_->copyBackSeq(*output_.grad);
-
-  if (!backwardByBatch && weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
-    for (size_t seq = 0; seq < numSequences; ++seq) {
-      int len = starts[seq + 1] - starts[seq];
-      weight_->getWGrad()->mul(
-          *output_.value
-               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
-               ->getTranspose(),
-          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
-                                   len - 1),
-          1,
-          1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
deleted file mode 100644
index 441025a9c9d..00000000000
--- a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLPackedWeight.h"
-#include "RecurrentLayer.h"
-
-DECLARE_bool(rnn_use_batch);
-
-namespace paddle {
-
-/**
- * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer
- * but is optimized with MKL cblas packed gemm.
- * More details:
- * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
- */
-
-class MKLPackedRecurrentLayer : public RecurrentLayer {
- public:
-  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
-      : RecurrentLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts) override;
-
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int* starts) override;
-
- protected:
-  /// packed_weight_ contains same data with
-  /// RecurrentLayer::weight_ but is packed
-  std::unique_ptr<MKLPackedWeight> packed_weight_;
-  /// packed_weightT_ is the transposition matrix of packed_weight_
-  std::unique_ptr<MKLPackedWeight> packed_weightT_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedWeight.h b/paddle/legacy/gserver/layers/MKLPackedWeight.h
deleted file mode 100644
index 47f225bd03c..00000000000
--- a/paddle/legacy/gserver/layers/MKLPackedWeight.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/MathFunctions.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Weight.h"
-
-namespace paddle {
-
-class MKLPackedWeight {
- protected:
-  /// The pointer of weight
-  real *weight_;
-  /// The pointer of cblas packed gemm to weight
-  real *packedWeight_;
-  size_t height_;
-  size_t width_;
-  bool transW_;
-
- public:
-  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
-    packedWeight_ = nullptr;
-    weight_ = weight->getData();
-    height_ = weight->getHeight();
-    width_ = weight->getWidth();
-    transW_ = transW;
-  }
-
-  ~MKLPackedWeight() { free_(); }
-
-  void pack() { pack_(weight_); }
-
-  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
-    cblas_sgemm_compute(CblasRowMajor,
-                        CblasNoTrans,
-                        CblasPacked,
-                        src->getHeight(),
-                        transW_ ? height_ : width_,
-                        transW_ ? width_ : height_,
-                        src->getData(),
-                        src->getWidth(),
-                        packedWeight_,
-                        width_,
-                        1.0,
-                        dst->getData(),
-                        dst->getWidth());
-  }
-
- protected:
-  void pack_(real *src) {
-    if (!packedWeight_) {
-      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
-    }
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     transW_ ? CblasTrans : CblasNoTrans,
-                     1,
-                     transW_ ? height_ : width_,
-                     transW_ ? width_ : height_,
-                     1.0,
-                     src,
-                     width_,
-                     packedWeight_);
-  }
-
-  void free_() {
-    if (packedWeight_) {
-      cblas_sgemm_free(packedWeight_);
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxIdLayer.cpp b/paddle/legacy/gserver/layers/MaxIdLayer.cpp
deleted file mode 100644
index eecd4996e96..00000000000
--- a/paddle/legacy/gserver/layers/MaxIdLayer.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for finding the id which has the maximal value for each sample.
- * The result is stored in output_.ids.
- *
- * The config file api is maxid_layer.
- */
-class MaxIdLayer : public Layer {
- private:
-  /// a predetermined number of best states at each level
-  size_t beamSize_;
-
- public:
-  explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-
-    beamSize_ = config_.has_beam_size() ? config_.beam_size() : FLAGS_beam_size;
-    CHECK_GE(beamSize_, 1LU);
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const Argument& input = getInput(0);
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_);
-    Matrix::resizeOrCreate(output_.in,
-                           batchSize,
-                           beamSize_,
-                           false,
-                           /* useGpu */ useGpu_);
-    output_.value = nullptr;
-    input.value->rowMax(*output_.ids, *output_.in);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(maxid, MaxIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.cpp b/paddle/legacy/gserver/layers/MaxLayer.cpp
deleted file mode 100644
index b51251b663c..00000000000
--- a/paddle/legacy/gserver/layers/MaxLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(max, MaxLayer);
-
-void MaxLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  IVector::resizeOrCreate(
-      maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_));
-  maxIndex_->zeroMem();
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(
-        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  }
-
-  if (config_.output_max_index()) {
-    // copy maxIndex_ to output
-    outputValue->copyFrom(*maxIndex_);
-  } else {
-    /* add the bias-vector AFTER max operation */
-    if (biases_.get() != NULL) {
-      outputValue->addBias(*(biases_->getW()), 1);
-    }
-    /* activation */ { forwardActivation(); }
-  }
-}
-
-void MaxLayer::backward(const UpdateCallback& callback) {
-  CHECK(!config_.output_max_index())
-      << "backward is not available when output_max_index is set";
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  if (inputGrad) {
-    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(
-        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.h b/paddle/legacy/gserver/layers/MaxLayer.h
deleted file mode 100644
index 12d0128e39f..00000000000
--- a/paddle/legacy/gserver/layers/MaxLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal max" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = max_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the max pooling operation is
- *              then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class MaxLayer : public SequencePoolLayer {
- protected:
-  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
-  IVectorPtr maxIndex_;
-
- public:
-  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    return SequencePoolLayer::init(layerMap, parameterMap);
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.cpp b/paddle/legacy/gserver/layers/MaxOutLayer.cpp
deleted file mode 100644
index 919f62a45ba..00000000000
--- a/paddle/legacy/gserver/layers/MaxOutLayer.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxOutLayer.h"
-#include "hl_cnn.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-REGISTER_LAYER(maxout, MaxOutLayer);
-
-size_t MaxOutLayer::getSize() {
-  const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = maxoutConf.image_conf().img_size_y();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = maxoutConf.image_conf().img_size();
-  }
-
-  featLen_ = imgSizeH_ * imgSizeW_;
-  size_t layerSize = featLen_ * outputChannels_;
-
-  getOutput().setFrameHeight(imgSizeH_);
-  getOutput().setFrameWidth(imgSizeW_);
-
-  return layerSize;
-}
-
-bool MaxOutLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for maxout-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
-  groups_ = conf.groups();
-  channels_ = conf.image_conf().channels();
-  CHECK_EQ(channels_ % groups_, 0UL);
-  outputChannels_ = channels_ / groups_;
-
-  return true;
-}
-
-void MaxOutLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  /* note: one sample correspond to one column */
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t size = getSize();
-  resetOutput(batchSize, size);
-  MatrixPtr inputV = getInputValue(0);
-  MatrixPtr outV = getOutputValue();
-
-  IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_);
-  outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_);
-}
-
-void MaxOutLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  /* Do derivation */
-  MatrixPtr inputG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-
-  if (inputG) {
-    inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.h b/paddle/legacy/gserver/layers/MaxOutLayer.h
deleted file mode 100644
index e56f34b8e02..00000000000
--- a/paddle/legacy/gserver/layers/MaxOutLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer to do max out on conv layer output.
- * Input: output of a conv layer.
- * Output: feature map size same as input.  Channel is (input channel) / groups.
- * So the num of channels should be able to devided by groups.
- *
- * The config file api is maxout_layer.
- */
-
-class MaxOutLayer : public Layer {
- protected:
-  size_t groups_;
-  size_t imgSizeH_, imgSizeW_;
-  /// outputChannels_ = channels_ / groups_
-  size_t channels_, outputChannels_;
-  /// feature length = imgSizeH_ * imgSizeW_
-  size_t featLen_;
-  IVectorPtr maxoutId_;
-
- public:
-  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
-  size_t getSize();
-
-  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
-  virtual ~MaxOutLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
deleted file mode 100644
index a1cc59a719e..00000000000
--- a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxPoolWithMaskLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-  setOutput("mask", &mask_);
-  return true;
-}
-
-size_t MaxPoolWithMaskLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-
-  outputY_ = outputSize(imgSizeY_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputX_ = outputSize(imgSize_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputX_ * outputY_ * channels_;
-  getOutput().setFrameHeight(outputY_);
-  getOutput().setFrameWidth(outputX_);
-
-  return layerSize;
-}
-
-void MaxPoolWithMaskLayer::forward(PassType passType) {
-  size_t size = getSize();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  int batchSize = inputV->getHeight();
-  resetOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  CHECK_EQ(size, outV->getWidth());
-
-  resetSpecifyOutput(mask_,
-                     batchSize,
-                     size,
-                     /* isValueClean */ false,
-                     /* isGradClean */ true);
-
-  MatrixPtr maskV = mask_.value;
-  outV->maxPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_,
-                       maskV);
-}
-
-void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  MatrixPtr outGrad = getOutputGrad();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
-
-  inputGrad->maxPoolBackward(*inputV,
-                             imgSizeY_,
-                             imgSize_,
-                             *outGrad,
-                             *outV,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
deleted file mode 100644
index fcd5388abe3..00000000000
--- a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class MaxPoolWithMaskLayer : public PoolLayer {
- protected:
-  Argument mask_;
-
- public:
-  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
-      : PoolLayer(config) {}
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.cpp b/paddle/legacy/gserver/layers/MixedLayer.cpp
deleted file mode 100644
index 63e658c09c2..00000000000
--- a/paddle/legacy/gserver/layers/MixedLayer.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MixedLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(mixed, MixedLayer);
-
-bool MixedLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    if (config_.inputs(i).has_proj_conf()) {
-      projections_[i].reset(Projection::create(
-          config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-    } else {
-      CHECK(!parameters_[i]) << "should no parameters for operators";
-    }
-  }
-  for (auto& operator_conf : config_.operator_confs()) {
-    for (auto& input_index : operator_conf.input_indices()) {
-      CHECK(!config_.inputs(input_index).has_proj_conf());
-    }
-    operators_.emplace_back(Operator::create(operator_conf, useGpu_));
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void MixedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->prefetch(&getInput(i));
-    }
-  }
-}
-
-void MixedLayer::resetState() {
-  for (auto& proj : projections_) {
-    if (proj) {
-      proj->resetState();
-    }
-  }
-}
-
-void MixedLayer::setState(LayerStatePtr state) {
-  CHECK(projectionStateMatrixSize_.size() == projections_.size())
-      << "projection size mis-match";
-
-  int start = 0;
-  LayerStatePtr statePtr = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) {
-    if (projectionStateMatrixSize_[i] > 0) {
-      statePtr->value.clear();
-      for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) {
-        statePtr->value.push_back(state->value[j]);
-      }
-      projections_[i]->setState(statePtr);
-      start += projectionStateMatrixSize_[i];
-    }
-  }
-  CHECK((int)state->value.size() == start) << "state matrix size mis-match";
-}
-
-// Return state which consists of all projections states
-LayerStatePtr MixedLayer::getState() {
-  bool init = projectionStateMatrixSize_.size() == 0;
-  LayerStatePtr res = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projections_.size(); i++) {
-    LayerStatePtr statePtr =
-        projections_[i] ? projections_[i]->getState() : nullptr;
-    int stateSize = statePtr == nullptr ? 0 : statePtr->value.size();
-    if (init) {
-      projectionStateMatrixSize_.push_back(stateSize);
-    } else {
-      CHECK(projectionStateMatrixSize_[i] == stateSize)
-          << "state matrix size mis-match";
-    }
-    if (statePtr != nullptr) {
-      for (auto& matrixPtr : statePtr->value) {
-        res->value.push_back(matrixPtr);
-      }
-    }
-  }
-  return res;
-}
-
-void MixedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->forward(&getInput(i), &output_, passType);
-    }
-  }
-
-  std::vector<const Argument*> ins;
-  for (auto& op : operators_) {
-    ins.clear();
-    for (auto& input_index : op->getConfig().input_indices()) {
-      ins.push_back(&getInput(input_index));
-    }
-    op->forward(ins, &output_, passType);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MixedLayer::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-
-  for (auto& op : operators_) {
-    op->backward();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.h b/paddle/legacy/gserver/layers/MixedLayer.h
deleted file mode 100644
index 43ee2bd8185..00000000000
--- a/paddle/legacy/gserver/layers/MixedLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "Operator.h"
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * A mixed layer has multiple input layers.
- * Each input layer was processed by a Projection or Operator.
- * The results of all projections or Operators are summed together with bias
- * (if configured), and then go through an activation function and dropout
- * (if configured).
- *
- * The config file api is mixed_layer.
- */
-class MixedLayer : public Layer {
- public:
-  explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MixedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void resetState() override;
-  /**
-   * setState() should be called after getState().
-   * Argument state consists of all projections states.
-   */
-  void setState(LayerStatePtr state) override;
-  /**
-   * Return state which consists of all projections states.
-   */
-  LayerStatePtr getState() override;
-
- protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<std::unique_ptr<Operator>> operators_;
-  /// the matrix size of projection state
-  std::vector<int> projectionStateMatrixSize_;
-  std::unique_ptr<Weight> biases_;
-  bool sharedBias_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp b/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
deleted file mode 100644
index 335e9a6ac47..00000000000
--- a/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiBoxLossLayer.h"
-#include <float.h>
-#include <vector>
-#include "DataLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
-
-bool MultiBoxLossLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  auto layerConf = config_.inputs(0).multibox_loss_conf();
-  numClasses_ = layerConf.num_classes();
-  inputNum_ = layerConf.input_num();
-  overlapThreshold_ = layerConf.overlap_threshold();
-  negPosRatio_ = layerConf.neg_pos_ratio();
-  negOverlap_ = layerConf.neg_overlap();
-  backgroundId_ = layerConf.background_id();
-  return true;
-}
-
-void MultiBoxLossLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-  resetOutput(batchSize, 1);
-
-  // all location data and confidence score data
-  locSizeSum_ = 0;
-  confSizeSum_ = 0;
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    locSizeSum_ += inLoc->getElementCnt();
-    confSizeSum_ += inConf->getElementCnt();
-  }
-
-  // locBuffer layout:
-  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
-  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
-  locBuffer_ = locTmpBuffer_;
-
-  // confBuffer layout:
-  // | class1 score | class2 score | ... |classN score | class1 score | ......
-  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
-  confBuffer_ = confTmpBuffer_;
-
-  // concate location data and confidence score data
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto& layerConf = config_.inputs(0).multibox_loss_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-    locOffset += appendWithPermute(*inLoc,
-                                   height,
-                                   width,
-                                   locSizeSum_,
-                                   locOffset,
-                                   batchSize,
-                                   *locBuffer_,
-                                   kNCHWToNHWC);
-    confOffset += appendWithPermute(*inConf,
-                                    height,
-                                    width,
-                                    confSizeSum_,
-                                    confOffset,
-                                    batchSize,
-                                    *confBuffer_,
-                                    kNCHWToNHWC);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-
-  // priorValue layout:
-  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
-  // | xmin2 | ......
-  MatrixPtr priorValue;
-
-  // labelValue layout:
-  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
-  MatrixPtr labelValue;
-
-  // Copy data from GPU to CPU if use GPU
-  if (useGpu_) {
-    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
-    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
-    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
-    Matrix::resizeOrCreate(
-        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
-    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
-    Matrix::resizeOrCreate(labelCpuValue_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    locCpuBuffer_->copyFrom(*locTmpBuffer_);
-    confCpuBuffer_->copyFrom(*confTmpBuffer_);
-    priorCpuValue_->copyFrom(*priorTmpValue);
-    labelCpuValue_->copyFrom(*labelTmpValue);
-
-    locBuffer_ = locCpuBuffer_;
-    confBuffer_ = confCpuBuffer_;
-    priorValue = priorCpuValue_;
-    labelValue = labelCpuValue_;
-  } else {
-    priorValue = getInputValue(*getPriorBoxLayer());
-    labelValue = getInputValue(*getLabelLayer());
-  }
-
-  // Get max scores for each prior bbox. Used in negative mining
-  std::vector<std::vector<real>> allMaxConfScore;
-  numPriors_ = priorValue->getElementCnt() / 8;
-  getMaxConfidenceScores(confBuffer_->getData(),
-                         batchSize,
-                         numPriors_,
-                         numClasses_,
-                         backgroundId_,
-                         &allMaxConfScore);
-
-  // Match prior bbox to groundtruth bbox
-  Argument label = getInput(*getLabelLayer());
-  const int* labelIndex = label.sequenceStartPositions->getData(false);
-  size_t seqNum = label.getNumSequences();
-  numMatches_ = 0;
-  numNegs_ = 0;
-  allMatchIndices_.clear();
-  allNegIndices_.clear();
-
-  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
-                                                           numPriors_,
-                                                           *labelValue,
-                                                           labelIndex,
-                                                           seqNum,
-                                                           allMaxConfScore,
-                                                           batchSize,
-                                                           overlapThreshold_,
-                                                           negOverlap_,
-                                                           negPosRatio_,
-                                                           &allMatchIndices_,
-                                                           &allNegIndices_);
-  numMatches_ = retPair.first;
-  numNegs_ = retPair.second;
-
-  // BBox location L1 smooth loss
-  locLoss_ = 0.0;
-  if (numMatches_ >= 1) {
-    size_t count = 0;
-    MatrixPtr locLossOutput;
-    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
-    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
-    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
-    locDiff_->zeroMem();
-    std::vector<real> locGTData;
-
-    real* locDiffData = locDiff_->getData();
-    const real* locBufferData = locBuffer_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;  // match none
-        size_t locOffset =
-            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
-        std::copy(locBufferData + locOffset,
-                  locBufferData + locOffset + 4,
-                  locDiffData + count);
-        count += 4;
-        const int gtIdx = allMatchIndices_[n][i];
-        size_t priorOffset = i * 8;
-        std::vector<NormalizedBBox> priorBBoxVec;
-        getBBoxFromPriorData(
-            priorValue->getData() + priorOffset, 1, priorBBoxVec);
-        std::vector<std::vector<real>> priorBBoxVar;
-        getBBoxVarFromPriorData(
-            priorValue->getData() + priorOffset, 1, priorBBoxVar);
-        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
-        std::vector<NormalizedBBox> gtBBoxVec;
-        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
-        std::vector<real> gtEncode;
-        encodeBBoxWithVar(
-            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
-        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
-      }
-    }
-    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
-    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
-    locLoss_ = locLossOutput->getSum() / numMatches_;
-  }
-
-  // BBox confidence softmax loss
-  confLoss_ = 0;
-  numConf_ = numMatches_ + numNegs_;
-  if (numConf_ >= 1) {
-    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
-    IVector::resizeOrCreate(confGTData_, numConf_, false);
-    confProb_->zeroMem();
-    size_t count = 0;
-
-    std::vector<real> confPredData;
-    real* confProbData = confProb_->getData();
-    const real* confBufferData = confBuffer_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
-        const int gtLabel = (labelValue->getData() + labelOffset)[0];
-        confGTData_->getData()[count] = gtLabel;
-        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
-        std::copy(confBufferData + confOffset,
-                  confBufferData + confOffset + numClasses_,
-                  confProbData + count * numClasses_);
-        confPredData.reserve(confPredData.size() + numClasses_);
-        confPredData.insert(confPredData.end(),
-                            confBufferData + confOffset,
-                            confBufferData + confOffset + numClasses_);
-        ++count;
-      }
-      // Negative mining samples
-      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
-        confGTData_->getData()[count] = backgroundId_;
-        size_t confOffset =
-            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
-        std::copy(confBufferData + confOffset,
-                  confBufferData + confOffset + numClasses_,
-                  confProbData + count * numClasses_);
-        confPredData.reserve(confPredData.size() + numClasses_);
-        confPredData.insert(confPredData.end(),
-                            confBufferData + confOffset,
-                            confBufferData + confOffset + numClasses_);
-        ++count;
-      }
-    }
-    CHECK_EQ(numConf_, count);
-    confProb_->softmax(*confProb_);
-    MatrixPtr confLossOutput;
-    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
-    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
-    confLoss_ = confLossOutput->getSum() / numMatches_;
-  }
-  real loss = locLoss_ + confLoss_;
-  MatrixPtr outV = getOutputValue();
-  outV->assign(loss);
-}
-
-void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-  locBuffer_->zeroMem();
-  confBuffer_->zeroMem();
-
-  // Back propagate on location prediction
-  if (numMatches_ >= 1) {
-    MatrixPtr locDiffBuffer;
-    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
-    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
-    locDiff_->copyFrom(*locDiffBuffer);
-    // scale gradient
-    for (size_t i = 0; i < numMatches_ * 4; ++i)
-      locDiff_->getData()[i] *= (1. / numMatches_);
-    // Copy gradient back
-    size_t count = 0;
-    const real* locDiffData = locDiff_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        real* locBufferData =
-            locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
-        std::copy(locDiffData + count * 4,
-                  locDiffData + (count + 1) * 4,
-                  locBufferData);
-        ++count;
-      }
-    }
-    CHECK_EQ(count, numMatches_);
-  }
-
-  if (numConf_ >= 1) {
-    for (size_t i = 0; i < numConf_; ++i)
-      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
-    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
-      confProb_->getData()[i] *= (1. / numMatches_);
-    size_t count = 0;
-    const real* confProbData = confProb_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        real* confDiffData = confBuffer_->getData() +
-                             n * numPriors_ * numClasses_ + i * numClasses_;
-        std::copy(confProbData + count * numClasses_,
-                  confProbData + (count + 1) * numClasses_,
-                  confDiffData);
-        ++count;
-      }
-      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
-        int idx = allNegIndices_[n][i];
-        real* confDiffData = confBuffer_->getData() +
-                             n * numPriors_ * numClasses_ + idx * numClasses_;
-        std::copy(confProbData + count * numClasses_,
-                  confProbData + (count + 1) * numClasses_,
-                  confDiffData);
-        ++count;
-      }
-    }
-    CHECK_EQ(count, numConf_);
-  }
-  if (useGpu_) {
-    locTmpBuffer_->copyFrom(*locCpuBuffer_);
-    confTmpBuffer_->copyFrom(*confCpuBuffer_);
-    locBuffer_ = locTmpBuffer_;
-    confBuffer_ = confTmpBuffer_;
-  }
-  // copy back
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto layerConf = config_.inputs(0).multibox_loss_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
-    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    // only for unittest, there are no width and height information
-    // when constructing matrix in unittest, so we should
-    // set the shape in configuration
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-
-    // NHWC to NCHW
-    MatrixPtr locGBuffer;
-    Matrix::resizeOrCreate(
-        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
-    MatrixPtr confGBuffer;
-    Matrix::resizeOrCreate(
-        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
-
-    locOffset += decomposeWithPermute(*locBuffer_,
-                                      height,
-                                      width,
-                                      locSizeSum_,
-                                      locOffset,
-                                      batchSize,
-                                      *locGBuffer,
-                                      kNHWCToNCHW);
-    inLocG->add(*locGBuffer);
-    confOffset += decomposeWithPermute(*confBuffer_,
-                                       height,
-                                       width,
-                                       confSizeSum_,
-                                       confOffset,
-                                       batchSize,
-                                       *confGBuffer,
-                                       kNHWCToNCHW);
-    inConfG->add(*confGBuffer);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiBoxLossLayer.h b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
deleted file mode 100644
index a358cded00b..00000000000
--- a/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
-
-licensed under the apache license, version 2.0 (the "license");
-you may not use this file except in compliance with the license.
-you may obtain a copy of the license at
-
-    http://www.apache.org/licenses/license-2.0
-
-unless required by applicable law or agreed to in writing, software
-distributed under the license is distributed on an "as is" basis,
-without warranties or conditions of any kind, either express or implied.
-see the license for the specific language governing permissions and
-limitations under the license. */
-
-#pragma once
-
-#include <vector>
-#include "CostLayer.h"
-#include "DataLayer.h"
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-using std::vector;
-using std::pair;
-
-namespace paddle {
-
-/**
- * The multibox loss layer for a SSD detection task.
- * The loss is composed by the location loss and the confidence loss.
- * The location loss is a smooth L1 loss and the confidence loss is
- * a softmax loss.
- * - Input: This layer needs four input layers: The first input layer
- *          is the priorbox layer and the second layer is a label layer.
- *          The rest two input layers are convolution layers for generating
- *          bbox location offset and the classification confidence.
- * - Output: The Single Shot Multibox Detection loss value.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class MultiBoxLossLayer : public CostLayer {
- public:
-  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
-
- protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[2 + index];
-  }
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[2 + inputNum_ + index];
-  }
-
- protected:
-  size_t numClasses_;
-  real overlapThreshold_;
-  real negPosRatio_;
-  real negOverlap_;
-  size_t inputNum_;
-  size_t backgroundId_;
-
-  real locLoss_;
-  real confLoss_;
-
-  size_t numPriors_;
-  size_t numMatches_;
-  size_t numNegs_;
-  size_t numConf_;
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  vector<vector<int>> allMatchIndices_;
-  vector<vector<int>> allNegIndices_;
-  MatrixPtr locGTData_;
-  IVectorPtr confGTData_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locDiff_;
-  MatrixPtr confProb_;
-
-  MatrixPtr labelCpuValue_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.cpp b/paddle/legacy/gserver/layers/MultinomialSampler.cpp
deleted file mode 100644
index e74ed795a15..00000000000
--- a/paddle/legacy/gserver/layers/MultinomialSampler.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultinomialSampler.h"
-
-namespace paddle {
-
-MultinomialSampler::MultinomialSampler(const real* prob, int size)
-    : rand_(0.0, size) {
-  intervals_.resize(size + 1);
-  double sum = 0;
-  for (int i = 0; i < size; ++i) {
-    sum += prob[i];
-  }
-
-  double intervalLength = sum / size;
-  double s = 1 / intervalLength;
-  for (int i = 0; i < size; ++i) {
-    intervals_[i] = {i, (real)(prob[i] * s)};
-  }
-
-  auto nextSmallPos = [&](int pos) {
-    while (pos < size &&
-           (pos != intervals_[pos].otherId || intervals_[pos].thresh >= 1)) {
-      ++pos;
-    }
-    return pos;
-  };
-
-  auto nextBigPos = [&](int pos) {
-    while (pos < size && intervals_[pos].thresh < 1) {
-      ++pos;
-    }
-    return pos;
-  };
-
-  int smallPos = nextSmallPos(0);
-  int bigPos = nextBigPos(0);
-
-  auto fillIntervals = [&]() {
-    while (bigPos < size) {
-      while (intervals_[bigPos].thresh > 1 && smallPos < size) {
-        intervals_[smallPos].otherId = bigPos;
-        intervals_[bigPos].thresh -= 1 - intervals_[smallPos].thresh;
-        smallPos = nextSmallPos(smallPos + 1);
-      }
-      if (smallPos >= size) break;
-      bigPos = nextBigPos(bigPos + 1);
-      // If intervals_[bigPos].thresh < 1, it becomes a small interval
-    }
-  };
-
-  fillIntervals();
-
-  smallPos = nextSmallPos(0);
-
-  // At this point there is no small intervals after bigPos. And this condition
-  // will remain true during the next fillIntervals()
-
-  fillIntervals();
-
-  // Handle the inaccuracy caused by finite-precision arithmetic which
-  // may results in some unprocessed small or big intervals at this point.
-  for (int i = 0; i < size; ++i) {
-    if (intervals_[i].otherId == i) {
-      intervals_[i].thresh = 1;
-    }
-  }
-
-  // The last one is to safeguard the case that the random number is equal
-  // to size
-  intervals_[size] = {size - 1, 1};
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.h b/paddle/legacy/gserver/layers/MultinomialSampler.h
deleted file mode 100644
index ed445352418..00000000000
--- a/paddle/legacy/gserver/layers/MultinomialSampler.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <random>
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/**
- * @brief Given the probability of N objects, the sampler random select
- * one of the object.
- * @note: prob does not have to be unnormalized.
- *
- * The space requirement is O(N)=O(N * sizeof(Interval)).
- * The computational complexity of generate one sample is O(1).
- */
-class MultinomialSampler {
- public:
-  MultinomialSampler(const real* prob, int size);
-
-  //! protobuf always using double.
-  static MultinomialSampler* create(const double* prob, int size) {
-#ifdef PADDLE_TYPE_DOUBLE
-    return new MultinomialSampler(prob, size);
-#else
-    std::unique_ptr<real[]> tmp(new real[size]);
-    std::copy(prob, prob + size, tmp.get());
-    return new MultinomialSampler(tmp.get(), size);
-#endif
-  }
-
-  /**
-   * @brief Generate a random sample.
-   * @param g is a random number engine. See <random>.
-   * @return Random integer.
-   */
-  template <typename URNG>
-  int gen(URNG& g) {
-    return gen1([&g, this]() { return rand_(g); });
-  }
-
- protected:
-  /**
-   * @brief Generation
-   * @param[in] rand rand is a real random number distribution
-   * for the range [0, size).
-   * @return random int number or intervals_[random_int_number].otherId.
-   */
-  template <typename Rand>
-  int gen1(Rand rand) {
-    double r = rand();  // NOLINT
-    int i = (int)r;
-    r -= i;
-    return r < intervals_[i].thresh ? i : intervals_[i].otherId;
-  }
-
-  struct Interval {
-    int otherId;
-    real thresh;
-  };
-
-  /// The probability of each interval will be 1./size
-  std::vector<Interval> intervals_;
-  std::uniform_real_distribution<double> rand_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiplexLayer.cpp b/paddle/legacy/gserver/layers/MultiplexLayer.cpp
deleted file mode 100644
index 9ca2b241759..00000000000
--- a/paddle/legacy/gserver/layers/MultiplexLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *@brief This layer multiplex multiple layers according to the index,
- * which is provided by the first input layer.
- * - Input[0]: the index of the layer to output of size batchSize.
- * - Input[1:N]; the candidate output data.
- * For each index i from 0 to batchSize -1, the output is the i-th row of the
- * (index[i] + 1)-th layer.
- *
- * For each i-th row of output:
- *
- * \f[
- *   y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
- * \f]
- * where, y is output. \f$x_{k}\f$ is the k-th input layer and
- * \f$k = x_{0}[i] + 1\f$.
- */
-
-class MultiplexLayer : public Layer {
- protected:
-  /**
-   * @brief A struct is used to save the copy information, includes input
-   * layer index and copy size.
-   */
-  struct CopyInfo {
-    CopyInfo(int inStartIdx, int inLength, int inCopyIdx)
-        : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {}
-
-    /// The start row of input.
-    int startIdx;
-    /// Number of rows. If the layer index in Input[0] is not consecutive,
-    /// the length is one. Otherwise, the length is > 1 and copy multi rows
-    /// once.
-    int length;
-    /// The copied layer index, which needs to add 1.
-    int copyIdx;
-  };
-
-  /// A list of CopyInfo used to save copy information.
-  std::vector<CopyInfo> copySchedule_;
-
-  /// Temporary matrix pointer to point to input data.
-  MatrixPtr tmpSrc_;
-  /// Temporary matrix pointer to point to output data.
-  MatrixPtr tmpDest_;
-
- public:
-  explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MultiplexLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /**
-   * @brief Calculate copy info for input layers.
-   */
-  void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns);
-};
-
-REGISTER_LAYER(multiplex, MultiplexLayer);
-
-void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds,
-                                           size_t numIns) {
-  copySchedule_.clear();
-  CopyInfo prevCopyInfo(0, 0, -1);
-  for (size_t i = 0; i < copyIds->getSize(); i++) {
-    int copyId = copyIds->getElement(i);
-    CHECK_GE(copyId, 0);
-    CHECK_LT(copyId, int(numIns));
-    // copy same input layer with prevous and will copy consecutive.
-    if (copyId == prevCopyInfo.copyIdx) {
-      ++prevCopyInfo.length;
-    } else {
-      if (prevCopyInfo.copyIdx != -1) {
-        copySchedule_.emplace_back(prevCopyInfo);
-      }
-      prevCopyInfo.startIdx = i;
-      prevCopyInfo.length = 1;
-      prevCopyInfo.copyIdx = copyId;
-    }
-  }
-  if (prevCopyInfo.copyIdx != -1) {
-    copySchedule_.emplace_back(prevCopyInfo);
-  }
-}
-
-bool MultiplexLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_GE(inputLayers_.size(), 2U);
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  return true;
-}
-
-void MultiplexLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  IVectorPtr copyIds = getInput(0).ids;
-  MatrixPtr inV1 = getInputValue(1);
-  CHECK_EQ(copyIds->getSize(), inV1->getHeight());
-  for (size_t i = 2; i < inputLayers_.size(); i++) {
-    CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight());
-    CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth());
-  }
-
-  calculateCopySchedule(copyIds, inputLayers_.size() - 1);
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(inV1->getHeight(), inV1->getWidth());
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      outV->subMatrix(info.startIdx, info.length, tmpDest_)
-          ->copyFrom(*getInputValue(info.copyIdx + 1)
-                          ->subMatrix(info.startIdx, info.length, tmpSrc_));
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MultiplexLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      if (getInputGrad(info.copyIdx + 1)) {
-        getInputGrad(info.copyIdx + 1)
-            ->subMatrix(info.startIdx, info.length, tmpDest_)
-            ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_));
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NCELayer.cpp b/paddle/legacy/gserver/layers/NCELayer.cpp
deleted file mode 100644
index ae4d6408168..00000000000
--- a/paddle/legacy/gserver/layers/NCELayer.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include "Layer.h"
-#include "MultinomialSampler.h"
-#include "paddle/legacy/math/MathFunctions.h"
-
-namespace paddle {
-
-/**
- * Noise-contrastive estimation.
- * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language
- * models.
- *
- * The config file api is nce_layer.
- */
-class NCELayer : public Layer {
-  int numClasses_;
-  /// number of input layer besides labelLayer and weightLayer
-  int numInputs_;
-  LayerPtr labelLayer_;
-  /// weight layer, can be None
-  LayerPtr weightLayer_;
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  std::unique_ptr<MultinomialSampler> sampler_;
-
-  std::uniform_int_distribution<int> rand_;
-
-  struct Sample {
-    int sampleId;
-    int labelId;
-    bool target;
-    real weight;
-  };
-  std::vector<Sample> samples_;
-  /// whether samples_ is prepared
-  bool prepared_;
-  Argument sampleOut_;
-
-  IVectorPtr labelIds_;
-
- public:
-  explicit NCELayer(const LayerConfig& config)
-      : Layer(config),
-        numClasses_(config.num_classes()),
-        rand_(0, config.num_classes() - 1),
-        prepared_(false) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    /* Initialize the basic parent class */
-    Layer::init(layerMap, parameterMap);
-
-    /* initialize the weightList */
-    size_t i;
-    for (i = 0; i < inputLayers_.size(); i++) {
-      if (!parameters_[i]) break;
-      size_t width = inputLayers_[i]->getSize();
-      // create a new weight
-      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
-      Weight* w = new Weight(numClasses_, width, parameters_[i]);
-
-      // append the new weight to the list
-      weights_.emplace_back(w);
-    }
-
-    CHECK_EQ(1U, getSize());
-
-    numInputs_ = i;
-    CHECK_GE(numInputs_, 1)
-        << "Must have at least one input besides label and weight";
-    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
-    labelLayer_ = inputLayers_[i];
-    if (++i < inputLayers_.size()) {
-      weightLayer_ = inputLayers_[i];
-      ++i;
-    }
-    CHECK_EQ(i, inputLayers_.size());
-
-    /* initialize biases_ */
-    if (biasParameter_.get() != NULL) {
-      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
-      biases_.reset(new Weight(1, numClasses_, biasParameter_));
-    }
-
-    if (config_.neg_sampling_dist_size()) {
-      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(MultinomialSampler::create(
-          config_.neg_sampling_dist().data(), numClasses_));
-    }
-
-    return true;
-  }
-
-  void prepareSamples() {
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    int batchSize = getInput(*labelLayer_).getBatchSize();
-    IVectorPtr label = getInput(*labelLayer_).ids;
-
-    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        getInput(*labelLayer_).value);
-
-    CHECK(label || multiLabel)
-        << "The label layer must have ids or NonValueSparseMatrix value";
-
-    auto& randEngine = ThreadLocalRandomEngine::get();
-
-    samples_.clear();
-    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));
-
-    real* weight =
-        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;
-
-    for (int i = 0; i < batchSize; ++i) {
-      real w = weight ? weight[i] : 1;
-      if (label) {
-        int* ids = label->getData();
-        samples_.push_back({i, ids[i], true, w});
-      } else {
-        const int* cols = multiLabel->getRowCols(i);
-        int n = multiLabel->getColNum(i);
-        for (int j = 0; j < n; ++j) {
-          samples_.push_back({i, cols[j], true, w});
-        }
-      }
-      for (int j = 0; j < config_.num_neg_samples(); ++j) {
-        int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine);
-        samples_.push_back({i, id, false, w});
-      }
-    }
-    prepared_ = true;
-  }
-
-  void prefetch() override {
-    prepareSamples();
-    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
-    int* ids = labelIds_->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      ids[i] = samples_[i].labelId;
-    }
-
-    for (int i = 0; i < numInputs_; ++i) {
-      auto sparseParam =
-          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-      if (sparseParam) {
-        sparseParam->addRows(labelIds_);
-      }
-    }
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    if (!prepared_) {
-      if (passType == PASS_GC) {
-        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
-      }
-      prepareSamples();
-    }
-    prepared_ = false;
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = getInputValue(0)->getHeight();
-    int size = getSize();
-    resetOutput(batchSize, size);
-
-    Matrix::resizeOrCreate(sampleOut_.value,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    forwardBias();
-
-    for (int l = 0; l < numInputs_; ++l) {
-      forwardOneInput(l);
-    }
-
-    auto status = activation_->forward(sampleOut_);
-    status.check();
-
-    forwardCost();
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    Matrix::resizeOrCreate(sampleOut_.grad,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    backwardCost();
-
-    auto status = activation_->backward(sampleOut_);
-    status.check();
-
-    if (biases_->getWGrad()) {
-      backwardBias(callback);
-    }
-
-    for (int l = 0; l < numInputs_; ++l) {
-      backwardOneInput(l, callback);
-    }
-  }
-
-  void forwardBias() {
-    if (!biases_) {
-      sampleOut_.value->zeroMem();
-    } else {
-      real* bias = biases_->getW()->getData();
-      real* sampleOut = sampleOut_.value->getData();
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        sampleOut[i] = bias[samples_[i].labelId];
-      }
-    }
-  }
-
-  void backwardBias(const UpdateCallback& callback) {
-    if (!biases_) return;
-    real* bias = biases_->getWGrad()->getData();
-    real* sampleOut = sampleOut_.grad->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      bias[samples_[i].labelId] += sampleOut[i];
-    }
-    biases_->incUpdate(callback);
-  }
-
-  void forwardOneInput(int layerId) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-
-    int dim = inputMat->getWidth();
-    real* sampleOut = sampleOut_.value->getData();
-
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      sampleOut[i] += dotProduct(dim,
-                                 inputMat->getRowBuf(samples_[i].sampleId),
-                                 weightMat->getRowBuf(samples_[i].labelId));
-    }
-  }
-
-  void backwardOneInput(int layerId, const UpdateCallback& callback) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& inputGradMat = getInputGrad(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();
-
-    int dim = inputMat->getWidth();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    if (weightGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             inputMat->getRowBuf(samples_[i].sampleId),
-             weightGradMat->getRowBuf(samples_[i].labelId));
-      }
-      weights_[layerId]->incUpdate(callback);
-    }
-
-    if (inputGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             weightMat->getRowBuf(samples_[i].labelId),
-             inputGradMat->getRowBuf(samples_[i].sampleId));
-      }
-    }
-  }
-
-  void forwardCost() {
-    real* out = output_.value->getData();
-    real* sampleOut = sampleOut_.value->getData();
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
-      out[samples_[i].sampleId] += samples_[i].weight * cost;
-    }
-  }
-
-  void backwardCost() {
-    real* sampleOut = sampleOut_.value->getData();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real w = samples_[i].weight;
-      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
-    }
-  }
-};
-
-REGISTER_LAYER(nce, NCELayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.cpp b/paddle/legacy/gserver/layers/NormLayer.cpp
deleted file mode 100644
index 443e26dbc85..00000000000
--- a/paddle/legacy/gserver/layers/NormLayer.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormLayer.h"
-#include "NormProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
-
-Layer* NormLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& norm = config.inputs(0).norm_conf().norm_type();
-  if (norm == "rnorm") {
-    return new ResponseNormLayer(config);
-  } else if (norm == "cmrnorm-projection") {
-    return new CMRProjectionNormLayer(config);
-  } else if (norm == "cross-channel-norm") {
-    return new CrossChannelNormLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown norm type: " << norm;
-    return nullptr;
-  }
-}
-
-bool ResponseNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  NormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  size_ = conf.size();
-  scale_ = conf.scale();
-  pow_ = conf.pow();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  denoms_ = NULL;
-
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.h b/paddle/legacy/gserver/layers/NormLayer.h
deleted file mode 100644
index 5ac00034d08..00000000000
--- a/paddle/legacy/gserver/layers/NormLayer.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of normalization
- *
- * @note Normalize the input in local region
- */
-class NormLayer : public Layer {
- public:
-  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    Layer::init(layerMap, parameterMap);
-    return true;
-  }
-
-  /**
-   * @brief create norm layer by norm_type
-   */
-  static Layer* create(const LayerConfig& config);
-};
-
-/**
- * @brief response normalization within feature maps
- * namely normalize in independent channel
- * When code refactoring, we delete the original implementation.
- * Need to implement in the futrue.
- */
-class ResponseNormLayer : public NormLayer {
- protected:
-  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
-  real scale_, pow_;
-  MatrixPtr denoms_;
-
- public:
-  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) override {
-    LOG(FATAL) << "Not implemented";
-  }
-};
-
-/**
- * This layer applys normalization across the channels of each sample to a
- * conv layer's output, and scales the output by a group of trainable factors
- * whose dimensions equal to the number of channels.
- * - Input: One and only one input layer are accepted.
- * - Output: The normalized data of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class CrossChannelNormLayer : public NormLayer {
- public:
-  explicit CrossChannelNormLayer(const LayerConfig& config)
-      : NormLayer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-
- protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
deleted file mode 100644
index 72affaa1ce6..00000000000
--- a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-size_t CMRProjectionNormLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-  outputH_ = imgSizeH_;
-  outputW_ = imgSizeW_;
-  layerSize = outputH_ * outputW_ * channels_;
-
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  return layerSize;
-}
-
-bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  ResponseNormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  createFunction(
-      forward_,
-      "CrossMapNormal",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-  createFunction(
-      backward_,
-      "CrossMapNormalGrad",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-
-  return true;
-}
-
-void CMRProjectionNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  /* note: one sample correspond to one row */
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
-
-  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-
-  // prepare forward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
-  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
-
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  // prepare backward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  inputs.addArg(*getOutputValue(), shape_);
-  inputs.addArg(*getOutputGrad(), shape_);
-  inputs.addArg(*denoms_, shape_);
-  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
-
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.h b/paddle/legacy/gserver/layers/NormProjectionLayer.h
deleted file mode 100644
index 492d1fcb723..00000000000
--- a/paddle/legacy/gserver/layers/NormProjectionLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "NormLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief response normalization across feature maps
- * namely normalize in number of size_ channels
- */
-class CMRProjectionNormLayer : public ResponseNormLayer {
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-
- public:
-  explicit CMRProjectionNormLayer(const LayerConfig& config)
-      : ResponseNormLayer(config) {}
-
-  ~CMRProjectionNormLayer() {}
-
-  size_t getSize();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  TensorShape shape_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Operator.cpp b/paddle/legacy/gserver/layers/Operator.cpp
deleted file mode 100644
index 5b9cf8d15d6..00000000000
--- a/paddle/legacy/gserver/layers/Operator.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Operator.h"
-
-namespace paddle {
-
-ClassRegistrar<Operator, OperatorConfig, bool> Operator::registrar_;
-
-Operator* Operator::create(const OperatorConfig& config, bool useGpu) {
-  return registrar_.createByType(config.type(), config, useGpu);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Operator.h b/paddle/legacy/gserver/layers/Operator.h
deleted file mode 100644
index 20a248985eb..00000000000
--- a/paddle/legacy/gserver/layers/Operator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-#include "Layer.h"
-#include "paddle/legacy/parameter/Argument.h"
-
-namespace paddle {
-
-// Macro for registering a operator type
-// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
-#define REGISTER_OPERATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    Operator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-/**
- * Operator like Projection, but takes more than one Arguments as input.
- * @note: Operator can't have parameters.
- */
-class Operator {
- public:
-  static Operator* create(const OperatorConfig& config, bool useGpu);
-
-  Operator(const OperatorConfig& config, bool useGpu)
-      : config_(config), useGpu_(useGpu) {}
-
-  virtual ~Operator() {}
-
-  const OperatorConfig& getConfig() const { return config_; }
-
-  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param ins inputs of operator
-   * @param out output of operator
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(std::vector<const Argument*> ins,
-               Argument* out,
-               PassType passType) {
-    ins_ = ins;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward() = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Set layer state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
- protected:
-  /// Config of operator
-  OperatorConfig config_;
-  bool useGpu_;
-
-  /// Store `ins` passed to forward()
-  std::vector<const Argument*> ins_;
-  /// Store `out` passed to forward()
-  Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/OuterProdLayer.cpp b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
deleted file mode 100644
index d0928be9d4d..00000000000
--- a/paddle/legacy/gserver/layers/OuterProdLayer.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the outer product of two vectors
- * @note used in NEURAL TURING MACHINE
- * Input1: vector (batchSize * dim1)
- * Input2: vector (batchSize * dim2)
- * Output: a matrix: (batchSize * (dim1*dim2))
- */
-
-class OuterProdLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-
- public:
-  explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~OuterProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(out_prod, OuterProdLayer);
-
-bool OuterProdLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dim0 = inputLayers_[0]->getSize();
-  size_t dim1 = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(
-      nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_);
-  tmpRow1 = Matrix::create(
-      nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ dim0,
-                           dim1,
-                           /* trans= */ false,
-                           useGpu_);
-  return true;
-}
-
-void OuterProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  CHECK_EQ(dim0 * dim1, getSize());
-  CHECK_EQ(inV1->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dim0 * dim1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str());
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpMtx0->setData(outV->getData() + i * dim0 * dim1);
-      tmpRow0->setData(inV0->getData() + i * dim0);
-      tmpRow1->setData(inV1->getData() + i * dim1);
-
-      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
-    }
-  }
-}
-
-void OuterProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str());
-
-    if (inG0) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inG0->getData() + i * dim0);
-        tmpRow1->setData(inV1->getData() + i * dim1);
-
-        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
-      }
-    }
-
-    if (inG1) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inV0->getData() + i * dim0);
-        tmpRow1->setData(inG1->getData() + i * dim1);
-
-        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.cpp b/paddle/legacy/gserver/layers/PadLayer.cpp
deleted file mode 100644
index 7b92b3de2d8..00000000000
--- a/paddle/legacy/gserver/layers/PadLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pad, PadLayer);
-
-bool PadLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  auto& pad_conf = config_.inputs(0).pad_conf();
-  auto& img_conf = pad_conf.image_conf();
-  CHECK_EQ(config_.inputs_size(), 1);
-  inDims_ = TensorShape(
-      {0,
-       img_conf.channels(),
-       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
-       img_conf.img_size()});
-
-  CHECK_EQ(2, pad_conf.pad_c_size());
-  CHECK_EQ(2, pad_conf.pad_h_size());
-  CHECK_EQ(2, pad_conf.pad_w_size());
-  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
-  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
-  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
-
-  outDims_ = TensorShape(4);
-  setOutDims(0);
-
-  createFunction(forward_,
-                 "Pad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-  createFunction(backward_,
-                 "PadGrad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-
-  return true;
-}
-
-void PadLayer::setOutDims(const size_t batchSize) {
-  outDims_.reshape({batchSize,
-                    inDims_[1] + padc_[0] + padc_[1],
-                    inDims_[2] + padh_[0] + padh_[1],
-                    inDims_[3] + padw_[0] + padw_[1]});
-}
-
-void PadLayer::setTensorDim(const size_t batchSize) {
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  setOutDims(batchSize);
-}
-
-void PadLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  setTensorDim(batchSize);
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("PadForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void PadLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.h b/paddle/legacy/gserver/layers/PadLayer.h
deleted file mode 100644
index 46b8a595978..00000000000
--- a/paddle/legacy/gserver/layers/PadLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer pads zeros to inputs according to the specify dimension.
- *         The input and output is a 4D tensor. Padding zeros from the 2nd to
- *         the 4th dimenstion according padc_, padh_ and padw_.
- */
-class PadLayer : public Layer {
- public:
-  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PadLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  void setOutDims(const size_t batchSize);
-  void setTensorDim(const size_t batchSize);
-
-  std::vector<uint32_t> padc_;
-  std::vector<uint32_t> padh_;
-  std::vector<uint32_t> padw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
deleted file mode 100644
index 23715d1975d..00000000000
--- a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterReluLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(prelu, ParameterReluLayer);
-
-bool ParameterReluLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  partialSum_ = config_.partial_sum();
-  CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero.";
-  CHECK(!(inputLayers_[0]->getSize() % partialSum_))
-      << "Incorrect value for partialSum: " << partialSum_
-      << " must divide input size: " << inputLayers_[0]->getSize();
-  CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize());
-  weight_ = std::unique_ptr<Weight>(new Weight(
-      1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0]));
-  return true;
-}
-
-void ParameterReluLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    outV->paramReluForward(*(getInput(0).value), *(weight_->getW()));
-  }
-}
-
-void ParameterReluLayer::backward(const UpdateCallback& callback) {
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(),
-                                            *(getInputValue(0)));
-  }
-
-  MatrixPtr preGrad = getInputGrad(0);
-  preGrad->paramReluBackwardDiff(
-      *getOutputGrad(), *(getInputValue(0)), *(weight_->getW()));
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.h b/paddle/legacy/gserver/layers/ParameterReluLayer.h
deleted file mode 100644
index 3aac4b42f60..00000000000
--- a/paddle/legacy/gserver/layers/ParameterReluLayer.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- *  @brief ParameterReluLayer active inputs with learnable parameter weight_.
- *  forward:
- *  \f[
- *      y = x > 0 ? x : w .* x
- *  \f]
- *  backward:
- *  \f[
- *      dx = x > 0 ? dy : w .* dy \\
- *      dw = x > 0 ? 0 : dy.*x
- *  \f]
- *  Here, x is the input, w is the weight, y is the output.
- *  dx, dw, dy is the gradient.
- */
-
-class ParameterReluLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> weight_;
-
-  /**
-   *  @brief partialSum_ makes a group of inputs share same weights,
-   *  - partialSum_ = 1:
-   *       element wise activation: each element has a weight_,
-   *  - partialSum_ = number of elements in one channel,
-   *       channels wise parameter activation, elements in a channel
-   *       share same weight_,
-   *  - partialSum_ = number of outputs
-   *       all elements share same weight_,
-   */
-  size_t partialSum_;
-
- public:
-  explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ParameterReluLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.cpp b/paddle/legacy/gserver/layers/Pool3DLayer.cpp
deleted file mode 100644
index ae3f55c27f2..00000000000
--- a/paddle/legacy/gserver/layers/Pool3DLayer.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Pool3DLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pool3d, Pool3DLayer);
-
-bool Pool3DLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-
-  sizeX_ = conf.size_x();
-  sizeY_ = conf.size_y();
-  sizeZ_ = conf.size_z();
-
-  strideW_ = conf.stride();
-  strideH_ = conf.stride_y();
-  strideD_ = conf.stride_z();
-
-  imgSizeW_ = conf.img_size();
-  imgSizeH_ = conf.img_size_y();
-  imgSizeD_ = conf.img_size_z();
-
-  paddingW_ = conf.padding();
-  paddingH_ = conf.padding_y();
-  paddingD_ = conf.padding_z();
-
-  outputW_ = conf.output_x();
-  outputH_ = conf.output_y();
-  outputD_ = conf.output_z();
-
-  return true;
-}
-
-size_t Pool3DLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-
-  size_t layerSize = 0;
-  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
-  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
-  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
-
-  layerSize = outputD_ * outputH_ * outputW_ * channels_;
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  getOutput().setFrameDepth(outputD_);
-  return layerSize;
-}
-
-void Pool3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  size_t batchSize = inMat->getHeight();
-  size_t outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
-  const MatrixPtr outMat = getOutputValue();
-
-  if (poolType_ == "avg") {
-    outMat->avgPool3DForward(*inMat,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else if (poolType_ == "max") {
-    outMat->maxPool3DForward(*inMat,
-                             *maxPoolIdx_,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-  forwardActivation();
-}
-
-void Pool3DLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-
-  (void)callback;
-  if (NULL == getInputGrad(0)) return;
-  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
-  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (poolType_ == "avg") {
-    inGradMat->avgPool3DBackward(*outGradMat,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else if (poolType_ == "max") {
-    inGradMat->maxPool3DBackward(*outGradMat,
-                                 *maxPoolIdx_,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.h b/paddle/legacy/gserver/layers/Pool3DLayer.h
deleted file mode 100644
index 6851c44ab22..00000000000
--- a/paddle/legacy/gserver/layers/Pool3DLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class Pool3DLayer : public Layer {
- public:
-  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
-  ~Pool3DLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  size_t getSize();
-
- protected:
-  int channels_;
-  int sizeX_, sizeY_, sizeZ_;
-  int strideW_, strideH_, strideD_;
-  int paddingW_, paddingH_, paddingD_;
-  int imgSizeW_, imgSizeH_, imgSizeD_;
-  int outputW_, outputH_, outputD_;
-  std::string poolType_;
-  MatrixPtr maxPoolIdx_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.cpp b/paddle/legacy/gserver/layers/PoolLayer.cpp
deleted file mode 100644
index df172d95757..00000000000
--- a/paddle/legacy/gserver/layers/PoolLayer.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolLayer.h"
-#include "MaxPoolWithMaskLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnPoolLayer.h"
-#endif
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create);
-
-bool PoolLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-  sizeX_ = conf.size_x();
-  stride_ = conf.stride();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  confPadding_ = conf.padding();
-
-  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
-  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
-  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
-  return true;
-}
-
-Layer* PoolLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& pool = config.inputs(0).pool_conf().pool_type();
-  if (pool == "max-projection" || pool == "avg-projection") {
-    return new PoolProjectionLayer(config);
-#ifdef PADDLE_WITH_CUDA
-  } else if (CudnnPoolLayer::typeCheck(pool)) {
-    return new CudnnPoolLayer(config);
-#endif
-  } else if (pool == "max-pool-with-mask") {
-    return new MaxPoolWithMaskLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << pool;
-    return nullptr;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.h b/paddle/legacy/gserver/layers/PoolLayer.h
deleted file mode 100644
index 0808dfae849..00000000000
--- a/paddle/legacy/gserver/layers/PoolLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class PoolLayer : public Layer {
- protected:
-  size_t channels_, sizeX_, stride_, outputX_, imgSize_;
-  int confPadding_;
-
-  size_t sizeY_;
-  size_t imgSizeY_;
-  size_t strideY_;
-  size_t outputY_;
-  int confPaddingY_;
-
-  std::string poolType_;
-
-  bool excludeMode_;
-
- public:
-  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  /**
-   * @brief create pooling layer by pool_type
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjection.cpp b/paddle/legacy/gserver/layers/PoolProjection.cpp
deleted file mode 100644
index 73ce88adf25..00000000000
--- a/paddle/legacy/gserver/layers/PoolProjection.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION_CREATE_FUNC(pool, &PoolProjection::create);
-
-PoolProjection::PoolProjection(const ProjectionConfig& config,
-                               ParameterPtr parameter,
-                               bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  const PoolConfig& conf = config_.pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-  sizeX_ = conf.size_x();
-  stride_ = conf.stride();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  confPadding_ = conf.padding();
-
-  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
-  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
-  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
-}
-
-size_t PoolProjection::getSize() {
-  imgSizeY_ = in_->getFrameHeight();
-  imgSize_ = in_->getFrameWidth();
-  const PoolConfig& conf = config_.pool_conf();
-  if (imgSizeY_ == 0) {
-    imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  if (imgSize_ == 0) {
-    imgSize_ = conf.img_size();
-  }
-  outputY_ = outputSize(imgSizeY_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputX_ = outputSize(imgSize_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  const_cast<Argument*>(out_)->setFrameHeight(outputY_);
-  const_cast<Argument*>(out_)->setFrameWidth(outputX_);
-
-  return outputY_ * outputX_ * channels_;
-}
-
-PoolProjection* PoolProjection::create(const ProjectionConfig& config,
-                                       ParameterPtr parameter,
-                                       bool useGpu) {
-  const std::string& pool = config.pool_conf().pool_type();
-  if (pool == "max-projection") {
-    return new MaxPoolProjection(config, parameter, useGpu);
-  } else if (pool == "avg-projection") {
-    return new AvgPoolProjection(config, parameter, useGpu);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << pool;
-    return nullptr;
-  }
-}
-
-void MaxPoolProjection::forward() {
-  size_t width = getSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  outV->maxPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_);
-}
-
-void MaxPoolProjection::backward(const UpdateCallback& callback) {
-  (void)callback;
-  MatrixPtr outGrad = out_->grad;
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  MatrixPtr inputGrad = in_->grad;
-
-  if (NULL == inputGrad) {
-    return;
-  }
-  inputGrad->maxPoolBackward(*inputV,
-                             imgSizeY_,
-                             imgSize_,
-                             *outGrad,
-                             *outV,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_);
-}
-
-void AvgPoolProjection::forward() {
-  size_t width = getSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  outV->avgPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_,
-                       excludeMode_);
-}
-
-void AvgPoolProjection::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = out_->grad;
-  MatrixPtr inputGrad = in_->grad;
-
-  if (NULL == inputGrad) {
-    return;
-  }
-
-  inputGrad->avgPoolBackward(*outputGrad,
-                             imgSizeY_,
-                             imgSize_,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_,
-                             excludeMode_);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjection.h b/paddle/legacy/gserver/layers/PoolProjection.h
deleted file mode 100644
index d01b6a13f0a..00000000000
--- a/paddle/legacy/gserver/layers/PoolProjection.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-class PoolProjection : public Projection {
- protected:
-  size_t imgSizeY_, imgSize_;
-  size_t outputY_, outputX_;
-  size_t strideY_, stride_;
-  size_t sizeY_, sizeX_;
-  int confPaddingY_, confPadding_;
-  size_t channels_;
-  std::string poolType_;
-  bool excludeMode_;
-
- public:
-  PoolProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu);
-
-  static PoolProjection* create(const ProjectionConfig& config,
-                                ParameterPtr parameter,
-                                bool useGpu);
-
-  const std::string& getPoolType() const { return poolType_; }
-
-  size_t getSize();
-};
-
-class MaxPoolProjection : public PoolProjection {
- public:
-  MaxPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-
-class AvgPoolProjection : public PoolProjection {
- public:
-  AvgPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
deleted file mode 100644
index e44b1d7ba14..00000000000
--- a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-size_t PoolProjectionLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-
-  outputH_ = outputSize(imgSizeH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ = outputSize(imgSizeW_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputH_ * outputW_ * channels_;
-
-  return layerSize;
-}
-
-void PoolProjectionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& in = getInput(0);
-  int batchSize = in.value->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-  poolProjection_->forward(&in, &output_, passType);
-}
-
-void PoolProjectionLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-  poolProjection_->backward(callback);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.h b/paddle/legacy/gserver/layers/PoolProjectionLayer.h
deleted file mode 100644
index fcd35bbba4d..00000000000
--- a/paddle/legacy/gserver/layers/PoolProjectionLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "PoolProjection.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class PoolProjectionLayer : public PoolLayer {
- protected:
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-  std::unique_ptr<PoolProjection> poolProjection_;
-  ProjectionConfig projectionConfig_;
-
- public:
-  explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
-    PoolConfig* conf = projectionConfig_.mutable_pool_conf();
-    *conf = config_.inputs(0).pool_conf();
-    poolProjection_.reset(
-        PoolProjection::create(projectionConfig_, nullptr, useGpu_));
-  }
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PowerLayer.cpp b/paddle/legacy/gserver/layers/PowerLayer.cpp
deleted file mode 100644
index 5e94c64db60..00000000000
--- a/paddle/legacy/gserver/layers/PowerLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer applys a power function to a vector element-wise,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y = x^w
- * \f]
- * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight,
- * and output \f$y\f$ is a vector.
- *
- * The config file api is power_layer.
- */
-
-class PowerLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx;
-
- public:
-  explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PowerLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(power, PowerLayer);
-
-bool PowerLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void PowerLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(getSize(), dataDim);
-  CHECK_EQ(1U, inV0->getWidth());
-  CHECK_EQ(batchSize, inV0->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str());
-    outV->rowPow(0, *inV1, *inV0);
-  }
-}
-
-void PowerLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str());
-    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
-
-    if (inG0) {
-      tmpMtx->log2(*inV1);
-      tmpMtx->dotMul(*tmpMtx, *outV);
-
-      // inG0 += outG .* (log(inV1) * outV)
-      inG0->rowDotMul(0, *outG, *tmpMtx);
-    }
-
-    if (inG1) {
-      // tmp = (outV / inV1) * inV0
-      tmpMtx->dotDiv(*outV, *inV1);
-      tmpMtx->rowScale(0, *tmpMtx, *inV0);
-
-      inG1->addDotMul(*outG, *tmpMtx, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp
deleted file mode 100644
index 6fbcc447f92..00000000000
--- a/paddle/legacy/gserver/layers/PrintLayer.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class PrintLayer : public Layer {
- public:
-  explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    std::vector<std::string> vals;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      std::ostringstream s;
-      getInput(i).printValueString(s, "");
-      vals.push_back(s.str());
-    }
-    size_t pos = 0;
-    size_t i = 0;
-    std::ostringstream s;
-    const std::string& format = config_.user_arg();
-    while (true) {
-      size_t pos1 = format.find("%s", pos);
-      if (pos1 == std::string::npos) break;
-      if (i >= vals.size()) {
-        break;
-      }
-      s << format.substr(pos, pos1 - pos) << vals[i];
-      pos = pos1 + 2;
-      ++i;
-    }
-    if (i != inputLayers_.size()) {
-      LOG(ERROR) << "Number of value in the format (" << format
-                 << ") is not same as the number of inputs ("
-                 << inputLayers_.size() << ") at " << getName();
-    }
-    s << format.substr(pos);
-
-    const std::string delimiter("\n");
-    std::string content = s.str();
-    std::string::size_type foundPos = 0;
-    std::string::size_type prevPos = 0;
-    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
-      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
-      prevPos = foundPos + delimiter.size();
-    }
-    LOG(INFO) << content.substr(prevPos);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(print, PrintLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp
deleted file mode 100644
index 83aab6e3666..00000000000
--- a/paddle/legacy/gserver/layers/PriorBox.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for generating priorbox locations and variances.
- * - Input: Two and only two input layer are accepted. The input layer must be
- *          be a data output layer and a convolution output layer.
- * - Output: The priorbox locations and variances of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class PriorBoxLayer : public Layer {
- public:  // NOLINT
-  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override {}
-
- protected:  // NOLINT
-  int numPriors_;
-  std::vector<int> minSize_;
-  std::vector<int> maxSize_;
-  std::vector<real> aspectRatio_;
-  std::vector<real> variance_;
-  MatrixPtr buffer_;
-};
-
-REGISTER_LAYER(priorbox, PriorBoxLayer);
-
-bool PriorBoxLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto pbConf = config_.inputs(0).priorbox_conf();
-  std::vector<real> tmp;
-  aspectRatio_.push_back(1.);
-  std::copy(pbConf.min_size().begin(),
-            pbConf.min_size().end(),
-            std::back_inserter(minSize_));
-  std::copy(pbConf.max_size().begin(),
-            pbConf.max_size().end(),
-            std::back_inserter(maxSize_));
-  std::copy(pbConf.variance().begin(),
-            pbConf.variance().end(),
-            std::back_inserter(variance_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(tmp));
-
-  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
-
-  // flip aspect ratios
-  for (unsigned index = 0; index < tmp.size(); index++) {
-    real ar = tmp[index];
-    if (fabs(ar - 1.) < 1e-6) continue;
-    aspectRatio_.push_back(ar);
-    aspectRatio_.push_back(1. / ar);
-  }
-
-  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
-
-  return true;
-}
-
-void PriorBoxLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto input = getInput(0);
-  int layerWidth = input.getFrameWidth();
-  int layerHeight = input.getFrameHeight();
-
-  auto image = getInput(1);
-  int imageWidth = image.getFrameWidth();
-  int imageHeight = image.getFrameHeight();
-
-  real stepW = static_cast<real>(imageWidth) / layerWidth;
-  real stepH = static_cast<real>(imageHeight) / layerHeight;
-  int dim = layerHeight * layerWidth * numPriors_ * 4;
-  reserveOutput(1, dim * 2);
-  // use a cpu buffer to compute
-  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
-  auto* tmpPtr = buffer_->getData();
-
-  int idx = 0;
-  for (int h = 0; h < layerHeight; ++h) {
-    for (int w = 0; w < layerWidth; ++w) {
-      real centerX = (w + 0.5) * stepW;
-      real centerY = (h + 0.5) * stepH;
-      for (size_t s = 0; s < minSize_.size(); s++) {
-        real minSize = minSize_[s];
-        real boxWidth = minSize;
-        real boxHeight = minSize;
-
-        // first prior: aspect_ratio == 1.0, compatible to old logic
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-
-        if (maxSize_.size() > 0) {
-          // square prior with size sqrt(minSize * maxSize)
-          real maxSize = maxSize_[s];
-          boxWidth = boxHeight = sqrt(minSize * maxSize);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-
-        // priors with different aspect ratios
-        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-          real ar = aspectRatio_[r];
-          if (fabs(ar - 1.0) < 1e-6) {
-            continue;
-          }
-          boxWidth = minSize * sqrt(ar);
-          boxHeight = minSize / sqrt(ar);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-      }
-    }
-  }
-
-  // clip the prior's coordidate such that it is within [0, 1]
-  for (int d = 0; d < dim * 2; ++d)
-    if ((d % 8) < 4)
-      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(buffer_->data_, dim * 2);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Projection.cpp b/paddle/legacy/gserver/layers/Projection.cpp
deleted file mode 100644
index 96d61e7f67b..00000000000
--- a/paddle/legacy/gserver/layers/Projection.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-#include "ContextProjection.h"
-#include "FullMatrixProjection.h"
-#include "TableProjection.h"
-
-namespace paddle {
-
-ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-    Projection::registrar_;
-
-Projection* Projection::create(const ProjectionConfig& config,
-                               ParameterPtr parameter,
-                               bool useGpu) {
-  return registrar_.createByType(config.type(), config, parameter, useGpu);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Projection.h b/paddle/legacy/gserver/layers/Projection.h
deleted file mode 100644
index 974f5a2cacd..00000000000
--- a/paddle/legacy/gserver/layers/Projection.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-// Macro for registering a projection type
-// Example: REGISTER_LAYER(fc, FullMatrixProjection);
-#define REGISTER_PROJECTION(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                 \
-    Projection::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction)    \
-  static InitFunction __reg_type_##__type_name([]() {                   \
-    Projection::registrar_.registerClass(#__type_name, createFunction); \
-  })
-
-/**
- * A projection takes one Argument as input, calculate the result and add it
- * to output Argument.
- */
-class Projection {
- public:
-  static Projection* create(const ProjectionConfig& config,
-                            ParameterPtr parameter,
-                            bool useGpu);
-
-  Projection(const ProjectionConfig& config,
-             ParameterPtr parameter,
-             bool useGpu)
-      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
-
-  virtual ~Projection() {}
-
-  const std::string& getName() const { return config_.name(); }
-
-  /// Register a projection
-  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-      registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param in input of projection
-   * @param out output of projection
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(const Argument* in, const Argument* out, PassType passType) {
-    in_ = in;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward(const UpdateCallback& callback) = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state. A copy of internal state is returned.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * init forward_ and backward_ functions
-   */
-  virtual bool init() { return true; }
-
-  /**
-   * Get output size of projection.
-   */
-  size_t getOutputSize() const { return config_.output_size(); }
-
- protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
- protected:
-  /// Config of projection
-  ProjectionConfig config_;
-  /// Parameter of projection
-  ParameterPtr parameter_;
-  bool useGpu_;
-
-  /// Store `in` passed to forward()
-  const Argument* in_;
-  /// Store `out` passed to forward()
-  const Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ROIPoolLayer.cpp b/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
deleted file mode 100644
index b5cbc0c704a..00000000000
--- a/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ROIPoolLayer.h"
-#include <cfloat>
-
-namespace paddle {
-
-REGISTER_LAYER(roi_pool, ROIPoolLayer);
-
-bool ROIPoolLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
-  pooledWidth_ = layerConf.pooled_width();
-  pooledHeight_ = layerConf.pooled_height();
-  spatialScale_ = layerConf.spatial_scale();
-
-  return true;
-}
-
-void ROIPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
-  height_ = getInput(0).getFrameHeight();
-  if (!height_) height_ = layerConf.height();
-  width_ = getInput(0).getFrameWidth();
-  if (!width_) width_ = layerConf.width();
-  channels_ = getInputValue(0)->getWidth() / width_ / height_;
-
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t numROIs = getInput(1).getBatchSize();
-
-  MatrixPtr dataValue = getInputValue(0);
-  MatrixPtr roiValue = getInputValue(1);
-  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
-  MatrixPtr outputValue = getOutputValue();
-
-  if (useGpu_) {  // TODO(guosheng): implement on GPU later
-    MatrixPtr dataCpuBuffer;
-    Matrix::resizeOrCreate(dataCpuBuffer,
-                           dataValue->getHeight(),
-                           dataValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr roiCpuBuffer;
-    Matrix::resizeOrCreate(roiCpuBuffer,
-                           roiValue->getHeight(),
-                           roiValue->getWidth(),
-                           false,
-                           false);
-    dataCpuBuffer->copyFrom(*dataValue);
-    roiCpuBuffer->copyFrom(*roiValue);
-    dataValue = dataCpuBuffer;
-    roiValue = roiCpuBuffer;
-    MatrixPtr outputCpuBuffer;
-    Matrix::resizeOrCreate(outputCpuBuffer,
-                           outputValue->getHeight(),
-                           outputValue->getWidth(),
-                           false,
-                           false);
-    outputCpuBuffer->copyFrom(*outputValue);
-    outputValue = outputCpuBuffer;
-  }
-
-  real* bottomData = dataValue->getData();
-  size_t batchOffset = dataValue->getWidth();
-  size_t channelOffset = height_ * width_;
-  real* bottomROIs = roiValue->getData();
-  size_t roiOffset = roiValue->getWidth();
-  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
-
-  real* outputData = outputValue->getData();
-  real* argmaxData = nullptr;
-  if (passType != PASS_TEST) {
-    Matrix::resizeOrCreate(maxIdxs_,
-                           numROIs,
-                           channels_ * pooledHeight_ * pooledWidth_,
-                           false,
-                           false);
-    argmaxData = maxIdxs_->getData();
-  }
-
-  for (size_t n = 0; n < numROIs; ++n) {
-    // the first five elememts of each RoI should be:
-    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
-    size_t roiBatchIdx = bottomROIs[0];
-    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
-    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
-    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
-    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
-    CHECK_GE(roiBatchIdx, 0UL);
-    CHECK_LT(roiBatchIdx, batchSize);
-    size_t roiHeight =
-        std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
-    size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
-    real binSizeH =
-        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
-    real binSizeW =
-        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
-    real* batchData = bottomData + batchOffset * roiBatchIdx;
-    for (size_t c = 0; c < channels_; ++c) {
-      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
-        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
-          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
-          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
-          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
-          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
-          hstart = std::min(
-              std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
-          wstart = std::min(
-              std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
-          hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
-                          height_);
-          wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
-                          width_);
-
-          bool isEmpty = (hend <= hstart) || (wend <= wstart);
-          size_t poolIndex = ph * pooledWidth_ + pw;
-          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
-          if (argmaxData) {
-            argmaxData[poolIndex] = -1;
-          }
-
-          for (size_t h = hstart; h < hend; ++h) {
-            for (size_t w = wstart; w < wend; ++w) {
-              size_t index = h * width_ + w;
-              if (batchData[index] > outputData[poolIndex]) {
-                outputData[poolIndex] = batchData[index];
-                if (argmaxData) {
-                  argmaxData[poolIndex] = index;
-                }
-              }
-            }
-          }
-        }
-      }
-      batchData += channelOffset;
-      outputData += poolChannelOffset;
-      if (argmaxData) {
-        argmaxData += poolChannelOffset;
-      }
-    }
-    bottomROIs += roiOffset;
-  }
-  if (useGpu_) {
-    getOutputValue()->copyFrom(*outputValue);
-  }
-}
-
-void ROIPoolLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inGradValue = getInputGrad(0);
-  MatrixPtr outGradValue = getOutputGrad();
-  MatrixPtr roiValue = getInputValue(1);
-
-  if (useGpu_) {
-    MatrixPtr inGradCpuBuffer;
-    Matrix::resizeOrCreate(inGradCpuBuffer,
-                           inGradValue->getHeight(),
-                           inGradValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr outGradCpuBuffer;
-    Matrix::resizeOrCreate(outGradCpuBuffer,
-                           outGradValue->getHeight(),
-                           outGradValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr roiCpuBuffer;
-    Matrix::resizeOrCreate(roiCpuBuffer,
-                           roiValue->getHeight(),
-                           roiValue->getWidth(),
-                           false,
-                           false);
-    inGradCpuBuffer->copyFrom(*inGradValue);
-    outGradCpuBuffer->copyFrom(*outGradValue);
-    roiCpuBuffer->copyFrom(*roiValue);
-    inGradValue = inGradCpuBuffer;
-    outGradValue = outGradCpuBuffer;
-    roiValue = roiCpuBuffer;
-  }
-
-  real* bottomROIs = roiValue->getData();
-  size_t numROIs = getInput(1).getBatchSize();
-  size_t roiOffset = getInputValue(1)->getWidth();
-
-  real* inDiffData = inGradValue->getData();
-  size_t batchOffset = getInputValue(0)->getWidth();
-  size_t channelOffset = height_ * width_;
-
-  real* outDiffData = outGradValue->getData();
-  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
-  real* argmaxData = maxIdxs_->getData();
-
-  for (size_t n = 0; n < numROIs; ++n) {
-    size_t roiBatchIdx = bottomROIs[0];
-    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
-    for (size_t c = 0; c < channels_; ++c) {
-      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
-        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
-          size_t poolIndex = ph * pooledWidth_ + pw;
-          if (argmaxData[poolIndex] > 0) {
-            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
-            batchDiffData[index] += outDiffData[poolIndex];
-          }
-        }
-      }
-      batchDiffData += channelOffset;
-      outDiffData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
-    }
-    bottomROIs += roiOffset;
-  }
-
-  if (useGpu_) {
-    getInputGrad(0)->copyFrom(*inGradValue);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ROIPoolLayer.h b/paddle/legacy/gserver/layers/ROIPoolLayer.h
deleted file mode 100644
index 801a9b3aebe..00000000000
--- a/paddle/legacy/gserver/layers/ROIPoolLayer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
- * feature map.
- * - Input: This layer needs two input layers: The first input layer is a
- *          convolution layer; The second input layer contains the ROI data
- *          which is the output of ProposalLayer in Faster R-CNN. layers for
- *          generating bbox location offset and the classification confidence.
- * - Output: The ROIs' feature map.
- * Reference:
- *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
- *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
- * Networks
- */
-
-class ROIPoolLayer : public Layer {
- protected:
-  size_t channels_;
-  size_t width_;
-  size_t height_;
-  size_t pooledWidth_;
-  size_t pooledHeight_;
-  real spatialScale_;
-
-  // Since there is no int matrix, use real maxtrix instead.
-  MatrixPtr maxIdxs_;
-
- public:
-  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.cpp b/paddle/legacy/gserver/layers/RecurrentLayer.cpp
deleted file mode 100644
index 3fc5bd15edd..00000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayer.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RecurrentLayer.h"
-
-DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
-
-namespace paddle {
-
-REGISTER_LAYER(recurrent, RecurrentLayer);
-
-bool RecurrentLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize(), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize(), biasParameter_));
-  }
-  reversed_ = config_.reversed();
-  return true;
-}
-
-void RecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-}
-
-void RecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1) << "one matrix is expected for RNN state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr RecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void RecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str());
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize(), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  output_.value->assign(*input.value);
-  if (bias_) {
-    output_.value->addBias(*bias_->getW(), 1);
-  }
-  if (!FLAGS_rnn_use_batch) {
-    forwardSequence(batchSize, numSequences, starts);
-  } else {
-    forwardBatch(batchSize, numSequences, starts);
-  }
-}
-
-void RecurrentLayer::forwardSequence(int batchSize,
-                                     size_t numSequences,
-                                     const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str());
-  frameOutput_.reserve(batchSize);
-  for (int i = frameOutput_.size(); i < batchSize; ++i) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               getSize(),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              getSize(),
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * getSize());
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t i = 0; i < numSequences; ++i) {
-    forwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void RecurrentLayer::forwardOneSequence(int start, int length) {
-  if (!reversed_) {
-    if (prevOutput_) {
-      frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-    activation_->forward(frameOutput_[start]).check();
-
-    for (int i = 1; i < length; ++i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*frameOutput_[start + length - 1].value);
-    }
-  } else {
-    activation_->forward(frameOutput_[start + length - 1]).check();
-    for (int i = length - 2; i >= 0; --i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-  }
-}
-
-void RecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  if (!FLAGS_rnn_use_batch) {
-    backwardSequence(batchSize, numSequences, starts);
-  } else {
-    backwardBatch(batchSize, numSequences, starts);
-  }
-
-  if (input.grad) {
-    input.grad->add(*output_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*output_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void RecurrentLayer::backwardSequence(int batchSize,
-                                      size_t numSequences,
-                                      const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str());
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize());
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t i = 0; i < numSequences; ++i) {
-    backwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void RecurrentLayer::backwardOneSequence(int start, int length) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  if (!reversed_) {
-    for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i - 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start + 1, length - 1),
-          1,
-          1);
-    }
-  } else {
-    for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i + 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start + length - 1]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start, length - 1),
-          1,
-          1);
-    }
-  }
-}
-
-void RecurrentLayer::forwardBatch(int batchSize,
-                                  size_t numSequences,
-                                  const int* starts) {
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->copyFromSeq(*output_.value);
-  {
-    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
-    AsyncGpuBlock asyncGpuBlock;
-    /* forward one batch */
-    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
-      MatrixPtr batch2 = batchValue_->getBatchValue(n);
-
-      if (n != 0) {
-        MatrixPtr batch1 =
-            batchValue_->getBatchValue(n - 1, batch2->getHeight());
-        batch2->mul(*batch1, *weight_->getW(), 1, 1);
-      }
-      Argument arg;
-      arg.value = batch2;
-      activation_->forward(arg).check();
-    }
-  }
-  batchValue_->copyBackSeq(*output_.value);
-}
-
-void RecurrentLayer::backwardBatch(int batchSize,
-                                   size_t numSequences,
-                                   const int* starts) {
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  size_t numBatch = batchGrad_->getNumBatch();
-  bool backwardByBatch = numBatch < numSequences;
-
-  batchGrad_->copyFromSeq(*output_.grad);
-  {
-    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
-    MatrixPtr weightT = weight_->getW()->getTranspose();
-    AsyncGpuBlock asyncGpuBlock;
-    /* backward one batch */
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr batch2 = batchGrad_->getBatchValue(n);
-      MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight());
-
-      Argument arg;
-      arg.value = batch1;
-      arg.grad = batch2;
-      activation_->backward(arg).check();
-
-      if (n != 0) {
-        batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
-        batch1->mul(*batch2, *weightT, 1, 1);
-      }
-
-      if (backwardByBatch && weight_->getWGrad()) {
-        if (n != 0) {
-          /* backward weight */
-          batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight());
-          weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1);
-        }
-      }
-    }
-  }
-
-  batchGrad_->copyBackSeq(*output_.grad);
-
-  if (!backwardByBatch && weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t seq = 0; seq < numSequences; ++seq) {
-      int len = starts[seq + 1] - starts[seq];
-      if (!reversed_) {
-        weight_->getWGrad()->mul(
-            *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(),
-            *output_.grad->subMatrix(starts[seq] + 1, len - 1),
-            1,
-            1);
-      } else {
-        weight_->getWGrad()->mul(
-            *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(),
-            *output_.grad->subMatrix(starts[seq], len - 1),
-            1,
-            1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.h b/paddle/legacy/gserver/layers/RecurrentLayer.h
deleted file mode 100644
index 287ea27a098..00000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayer.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
- public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void forwardBatch(int batchSize,
-                            size_t numSequences,
-                            const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void backwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int* starts);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
deleted file mode 100644
index 39321245995..00000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include "paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * Recurrent layer group is a group of layers, which forward/backward one frame
- * after previous frame forward/backward through all layers in layer group.
- * It's automatically added by config_parser if some layers are defined
- * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
- */
-class RecurrentLayerGroup : public Layer {
- public:
-  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
-
-  void initSubNetwork(NeuralNetwork* rootNetwork,
-                      const ModelConfig& config,
-                      const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu) override;
-
-  void forward(PassType passType) override {
-    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    network_->forward(inArgs, &outArgs, passType);
-  }
-  void backward(const UpdateCallback& callback) override {
-    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
-    network_->backward(nullptr);
-
-    for (auto& para : parameters_) {
-      para->incUpdate(callback);
-    }
-  }
-
-  /**
-   * @see Layer.accessSubNetwork
-   */
-  void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) override {
-    callback(*network_);
-  }
-
- private:
-  std::unique_ptr<RecurrentGradientMachine> network_;
-};
-
-REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
-
-void RecurrentLayerGroup::initSubNetwork(
-    NeuralNetwork* rootNetwork,
-    const ModelConfig& config,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  setNeedGradient(true);
-
-  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
-  ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) {
-    para->enableSharedType(
-        PARAMETER_VALUE,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-  network_->init(config, cb, parameterTypes, useGpu);
-
-  for (auto paramId : network_->getParameterIds()) {
-    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
-    parameter->incShared();
-    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-    parameters_.push_back(parameter);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ResizeLayer.cpp b/paddle/legacy/gserver/layers/ResizeLayer.cpp
deleted file mode 100644
index 8f8aad820f7..00000000000
--- a/paddle/legacy/gserver/layers/ResizeLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for resizing a minibatch matrix h*w to h'*w'
- * @note
- * origin matrix height * width)
- * resize matrix: (height * width / size) * size
- */
-class ResizeLayer : public Layer {
- public:
-  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-};
-
-REGISTER_LAYER(resize, ResizeLayer);
-
-bool ResizeLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ResizeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-  CHECK_EQ((height * width) % getSize(), 0UL);
-
-  reserveOutput(height * width / getSize(), getSize());
-  MatrixPtr tmp =
-      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
-  tmp->assign(*input.value);
-}
-
-void ResizeLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-
-  if (!input.grad) {
-    return;
-  }
-
-  MatrixPtr tmp = Matrix::create(input.grad->getData(),
-                                 height * width / getSize(),
-                                 getSize(),
-                                 false,
-                                 useGpu_);
-  tmp->add(*output_.grad);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RotateLayer.cpp b/paddle/legacy/gserver/layers/RotateLayer.cpp
deleted file mode 100644
index f205d1a9194..00000000000
--- a/paddle/legacy/gserver/layers/RotateLayer.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RotateLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(rotate, RotateLayer);
-
-bool RotateLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  height_ = config_.height();
-  width_ = config_.width();
-  CHECK_GT(height_, 0);
-  CHECK_GT(width_, 0);
-  return true;
-}
-
-void RotateLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr input = getInputValue(0);
-  batchSize_ = input->getHeight();
-  size_ = input->getWidth();
-  CHECK_GE(size_, height_ * width_);
-  CHECK_EQ(size_ % (height_ * width_), 0)
-      << "total size_ is not dividable by (height_ * width_), i.e., "
-      << "channel number should be an integer";
-  channels_ = size_ / (height_ * width_);
-
-  resizeOutput(batchSize_, size_);
-
-  MatrixPtr outV = getOutputValue();
-  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
-    for (int c = 0; c < channels_; c++) {  // for each feat channel
-      MatrixPtr inputSample =
-          Matrix::create(input->getData() + b * size_ + c * height_ * width_,
-                         height_,
-                         width_,
-                         false,
-                         useGpu_);
-      MatrixPtr outputSample =
-          Matrix::create(outV->getData() + b * size_ + c * height_ * width_,
-                         width_,
-                         height_,
-                         false,
-                         useGpu_);
-      inputSample->rotate(outputSample, false, true /* clock-wise */);
-    }
-  }
-
-  if (getInputGrad(0)) {
-    zeroGrad();
-  }
-}
-
-void RotateLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = getOutputGrad();
-  if (outputGrad == NULL) {
-    return;
-  }
-  // the grad should be rotated in the reverse direction
-  MatrixPtr preGrad = getInputGrad(0);
-
-  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
-    for (int c = 0; c < channels_; c++) {  // for each feat channel
-      MatrixPtr inputSampleGrad =
-          Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_,
-                         height_,
-                         width_,
-                         false,
-                         useGpu_);
-      MatrixPtr outputSampleGrad = Matrix::create(
-          outputGrad->getData() + b * size_ + c * height_ * width_,
-          width_,
-          height_,
-          false,
-          useGpu_);
-      MatrixPtr tmpGrad = nullptr;
-      outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */);
-      inputSampleGrad->add(*tmpGrad);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RotateLayer.h b/paddle/legacy/gserver/layers/RotateLayer.h
deleted file mode 100644
index 498e24372b8..00000000000
--- a/paddle/legacy/gserver/layers/RotateLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
- * domain
- * The rotation is 90 degrees in clock-wise for each channel
- * \f[
- *   y(j,i,:) = x(M-i-1,j,:)
- * \f]
- * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
- *
- * The config file api is rotate_layer
- *
- */
-
-class RotateLayer : public Layer {
- public:
-  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
-
- private:
-  int batchSize_;
-  int size_;
-  int height_;
-  int width_;
-  int channels_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.cpp b/paddle/legacy/gserver/layers/RowConvLayer.cpp
deleted file mode 100644
index 1961557dc2d..00000000000
--- a/paddle/legacy/gserver/layers/RowConvLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(row_conv, RowConvLayer);
-
-bool RowConvLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
-  createFunction(forward_, "RowConv", FuncConfig());
-  createFunction(backward_, "RowConvGrad", FuncConfig());
-
-  return true;
-}
-
-void RowConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = getInputValue(0);
-  size_t height = input->getHeight();
-  size_t width = input->getWidth();
-  CHECK_EQ(width, getSize());
-  resetOutput(height, width);
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-  MatrixPtr w = weight_->getW();
-  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
-
-  MatrixPtr outV = getOutputValue();
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*w, wDims_);
-  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
-    forward_[0]->calc(inputs, outputs);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void RowConvLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), *startPos);
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*weight_->getW(), wDims_);
-
-  MatrixPtr inGrad = getInputGrad(0);
-  MatrixPtr wGrad = weight_->getWGrad();
-  size_t h = getInputValue(0)->getHeight();
-  size_t w = getInputValue(0)->getWidth();
-  outputs.addArg(
-      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
-      *startPos,
-      ADD_TO);
-  outputs.addArg(
-      wGrad ? (*wGrad)
-            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
-      wDims_,
-      ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
-    backward_[0]->calc(inputs, outputs);
-  }
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.h b/paddle/legacy/gserver/layers/RowConvLayer.h
deleted file mode 100644
index 3b74df0b1af..00000000000
--- a/paddle/legacy/gserver/layers/RowConvLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief Row Convolution Layer.
- */
-class RowConvLayer : public Layer {
- public:
-  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~RowConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  // Row convolution weight, context_lenght_ * fan_out.
-  // fan_out is the size of output feature.
-  std::unique_ptr<Weight> weight_;
-
-  // The step number to look ahead plus one equals contexLength_.
-  size_t contexLength_;
-  TensorShape wDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowL2NormLayer.cpp b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
deleted file mode 100644
index d5e6e10a027..00000000000
--- a/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for L2 normalization in each row,
- * \f[
- *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
- * \f]
- * where the size of \f$in\f$ is (batchSize x dataDim),
- * and the size of \f$out\f$ is (batchSize x dataDim).
- */
-
-class RowL2NormLayer : public Layer {
- protected:
-  MatrixPtr inSquare_;
-  MatrixPtr l2NormReciprocal_;
-  MatrixPtr dotSum_;
-
- public:
-  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
-
-bool RowL2NormLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void RowL2NormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-  CHECK_EQ(dataDim, inV->getWidth());
-  resetOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
-  inV->square2(*inSquare_);
-  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
-  inSquare_->rowSum(*l2NormReciprocal_);
-  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
-  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
-  outV->rowScale(0, *inV, *l2NormReciprocal_);
-}
-
-void RowL2NormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-  size_t batchSize = inV->getHeight();
-
-  // inG[ij] += outG[ij] / l2NormReciprocal
-  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
-  // inV[i])
-  if (inG) {
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    inSquare_->rowScale(0, *inV, *dotSum_);
-    inG->sub(*inSquare_);
-    inG->addRowScale(0, *outG, *l2NormReciprocal_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SamplingIdLayer.cpp b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
deleted file mode 100644
index dbce6358812..00000000000
--- a/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <random>
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for sampling id from multinomial distribution from the
- * input layer. Sampling one id for one sample. The result is stored in
- * output_.ids.
- *
- * The config file api is sampling_id_layer.
- */
-class SamplingIdLayer : public Layer {
-  /// Produces random floating-point values, uniformly distributed on [0, 1).
-  std::uniform_real_distribution<double> rand1_;
-  std::vector<Argument> tmpCpuInput_;
-
- public:
-  explicit SamplingIdLayer(const LayerConfig& config)
-      : Layer(config), rand1_(0, 1) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    if (useGpu_) {
-      tmpCpuInput_.reserve(inputLayers_.size());
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_.push_back(Argument());
-      }
-    }
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    if (useGpu_) {
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(
-            getInput(i), false, HPPL_STREAM_DEFAULT);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      forwardImp(tmpCpuInput_[0]);
-    } else {
-      forwardImp(getInput(0));
-    }
-  }
-
-  void forwardImp(const Argument& input) {
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
-    real* buf = input.value->getData();
-    int dim = input.value->getWidth();
-    std::vector<int> ids(batchSize);
-    auto& reng = ThreadLocalRandomEngine::get();
-    for (size_t i = 0; i < batchSize; ++i) {
-      double r = rand1_(reng);
-      int id = dim - 1;
-      for (int j = 0; j < dim; ++j) {
-        if ((r -= buf[i * dim + j]) < 0) {
-          id = j;
-          break;
-        }
-      }
-      ids[i] = id;
-    }
-    output_.ids->copyFrom(ids.data(), batchSize);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(sampling_id, SamplingIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
deleted file mode 100644
index 8af78a2e27d..00000000000
--- a/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer applies a linear transformation to each element in each row of
- * the input matrix. For each element, the layer first re-scale it and then
- * adds a bias to it.
- *
- * \f[
- *    y = wx + b
- * \f]
- *
- * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
- *
- */
-
-class ScaleShiftLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> scale_;
-  std::unique_ptr<Weight> offset_;
-
- public:
-  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scale_shift, ScaleShiftLayer);
-
-bool ScaleShiftLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1U);
-  scale_.reset(new Weight(1, 1, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
-  }
-  return true;
-}
-
-void ScaleShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  real scaleValue = scale_->getW()->getElement(0, 0);
-  outV->mulScalar(*inV, scaleValue);
-  if (offset_) {
-    real offsetValue = offset_->getW()->getElement(0, 0);
-    outV->add(offsetValue);
-  }
-}
-
-void ScaleShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  /* Calculate the parameter gradient for the current layer */
-  if (scale_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-    rowSumMtx->sumOfProducts(
-        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-    scale_->getWGrad()->sumCols(
-        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
-    scale_->getParameterPtr()->incUpdate(callback);
-  }
-  if (offset_ && offset_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    rowSumMtx->sumRows(*outG, 1., 0.);
-    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
-    offset_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers error */
-  if (inG) {
-    real scaleValue = scale_->getW()->getElement(0, 0);
-    inG->add(*outG, scaleValue);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
deleted file mode 100644
index 70d44d2a7ef..00000000000
--- a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
-
-bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
-  auto& conf = config_.inputs(0).scale_sub_region_conf();
-  value_ = conf.value();
-
-  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
-  createFunction(
-      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
-
-  return true;
-}
-
-void ScaleSubRegionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto in0 = getInput(0);
-  imgH_ = in0.getFrameHeight();
-  imgW_ = in0.getFrameWidth();
-  if (imgH_ == 0 || imgW_ == 0) {
-    auto& conf = config_.inputs(0).scale_sub_region_conf();
-    imgH_ = conf.image_conf().img_size_y();
-    imgW_ = conf.image_conf().img_size();
-  }
-  MatrixPtr imgV = in0.value;
-  size_t batchSize = imgV->getHeight();
-  size_t spatialSize = imgH_ * imgW_;
-  channelsNum_ = imgV->getWidth() / spatialSize;
-  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
-
-  resetOutput(batchSize, imgV->getWidth());
-  auto& out = getOutput();
-  out.setFrameHeight(imgH_);
-  out.setFrameWidth(imgW_);
-
-  MatrixPtr indicesV = getInputValue(1);
-  indicesShape_ = TensorShape({batchSize, 6});
-
-  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*imgV, shape_);
-  inArgs.addArg(*indicesV, indicesShape_);
-  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
-  forward_[0]->calc(inArgs, outArgs);
-}
-
-void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*getOutputGrad(), shape_);
-  inArgs.addArg(*getInputValue(1), indicesShape_);
-  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
-  backward_[0]->calc(inArgs, outArgs);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
deleted file mode 100644
index fe431698bc6..00000000000
--- a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  For each instance, this layer can be used to multiply a value to a
- *         specified sub continuous region. By providing start index and end
- *         index for C/H/W, you can specify the location and shape of the
- *         region.
- *
- *         input_0: Input value.
- *         input_1: Indices value to specify the location an shape of the
- *                  region.
- */
-class ScaleSubRegionLayer : public Layer {
- public:
-  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScaleSubRegionLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
- protected:
-  TensorShape shape_;
-  TensorShape indicesShape_;
-  size_t imgH_;
-  size_t imgW_;
-  size_t channelsNum_;
-  real value_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingLayer.cpp b/paddle/legacy/gserver/layers/ScalingLayer.cpp
deleted file mode 100644
index a8286b6614c..00000000000
--- a/paddle/legacy/gserver/layers/ScalingLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for each row of a matrix, multiplying with a element of a vector,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x.row[i]
- * \f]
- * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is
- * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is scaling_layer.
- */
-
-class ScalingLayer : public Layer {
- public:
-  explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScalingLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scaling, ScalingLayer);
-
-bool ScalingLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ScalingLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(weightV->getWidth(), 1U);
-  CHECK_EQ(weightV->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str());
-    // outV += inV1 * weight
-    outV->addRowScale(0, *inV1, *weightV);
-  }
-}
-
-void ScalingLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str());
-
-    if (inG0) {
-      // inG0 += outG .* inV1
-      inG0->rowDotMul(0, *outG, *inV1);
-    }
-
-    if (inG1) {
-      // inG1 += outG * weight;
-      inG1->addRowScale(0, *outG, *weightV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingProjection.cpp b/paddle/legacy/gserver/layers/ScalingProjection.cpp
deleted file mode 100644
index 4d871cafc4d..00000000000
--- a/paddle/legacy/gserver/layers/ScalingProjection.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-class ScalingProjection : public Projection {
- public:
-  ScalingProjection(const ProjectionConfig& config,
-                    const ParameterPtr& parameter,
-                    bool useGpu)
-      : Projection(config, parameter, useGpu) {
-    CHECK_EQ(parameter->getSize(), 1UL);
-    weight_.reset(new Weight(1, 1, parameter));
-  }
-
-  void forward() {
-    CHECK(in_->value);
-    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
-  }
-
-  void backward(const UpdateCallback& callback) {
-    if (weight_->getWGrad()) {
-      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
-      sum->sumOfProducts(*in_->value,
-                         *out_->grad,
-                         /* scaleSum= */ 1,
-                         /* scaleDest= */ 0);
-      weight_->getWGrad()->sumCols(*sum,
-                                   /* scaleSum= */ 1,
-                                   /* scaleDest= */ 1);
-      parameter_->incUpdate(callback);
-    }
-    if (in_->grad) {
-      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
-    }
-  }
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(scaling, ScalingProjection);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
deleted file mode 100644
index 72fb0681488..00000000000
--- a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SelectiveFullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer);
-
-bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  inputNum_ = inputLayers_.size();
-  if (config_.has_selected_colums()) {
-    inputNum_ -= 1;
-  }
-  for (size_t i = 0; i < inputNum_; i++) {
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-    // NOTE weight is transpoed
-    weights_.emplace_back(new Weight(width, height, parameters_[i]));
-  }
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  fullOutput_ = false;
-
-  return true;
-}
-
-void SelectiveFullyConnectedLayer::prefetch() {}
-
-void SelectiveFullyConnectedLayer::reserveOutput(size_t height,
-                                                 size_t width,
-                                                 size_t nnz) {
-  bool flag = (passType_ == PASS_TEST &&
-               config_.selective_fc_pass_generation() && !fullOutput_);
-  SetDevice device(output_.deviceId);
-  if (flag) {
-    // output_.value is sparse matrix
-    if (dynamic_cast<CpuMatrix*>(output_.value.get()) ||
-        dynamic_cast<GpuMatrix*>(output_.value.get())) {
-      output_.value = nullptr;
-    }
-    Matrix::resizeOrCreateSparseMatrix(output_.value,
-                                       height,
-                                       width,
-                                       nnz,
-                                       FLOAT_VALUE,
-                                       SPARSE_CSR,
-                                       /*trans=*/false,
-                                       /*useGpu=*/useGpu_);
-    output_.value->copyFrom(*selCols_);
-    interOutput_ = output_.value;
-  } else {
-    if (fullOutput_) {
-      // output_.value is dense matrix
-      if (dynamic_cast<CpuSparseMatrix*>(output_.value.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(output_.value.get())) {
-        output_.value = nullptr;
-      }
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             width,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = output_.value;
-    } else {
-      // output_.value is dense matrix, but width = nnz /height
-      CHECK_EQ(nnz % height, 0U);
-      CHECK(nnz / height);
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             nnz / height,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = Matrix::createSparseMatrix(output_.value->getData(),
-                                                selCols_->getRows(),
-                                                selCols_->getCols(),
-                                                height,
-                                                width,
-                                                nnz,
-                                                FLOAT_VALUE,
-                                                SPARSE_CSR,
-                                                /*trans=*/false,
-                                                /*useGpu=*/useGpu_);
-    }
-  }
-  interOutput_->zeroMem();
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a "
-                                  "same number of selected columns.";
-    CHECK(nnz / height)
-        << "during training, "
-           "each sample must have at least one column selected.";
-    Matrix::resizeOrCreate(output_.grad,
-                           height,
-                           nnz / height,
-                           /*trans=*/false,
-                           /*useGpu=*/useGpu_);
-    output_.grad->zeroMem();
-  }
-}
-
-void SelectiveFullyConnectedLayer::forward(PassType passType) {
-  REGISTER_TIMER("selective_fc.forward");
-  Layer::forward(passType);
-
-  getSelectiveCols();
-  size_t height = getInput(0).getBatchSize();
-  size_t width = getSize();
-  size_t nnz = height * width;
-  if (!fullOutput_) {
-    CHECK(selCols_);
-    CHECK(height == selCols_->getHeight());
-    CHECK(width == selCols_->getWidth());
-    nnz = selCols_->getElementCnt();
-  }
-
-  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
-  // this outV should be used as input of MaxIdLayer and softmax activation
-  reserveOutput(height, width, nnz);
-
-  bool flag = true;
-  for (size_t i = 0; i < inputNum_; i++) {
-    MatrixPtr input = getInputValue(i);
-    MatrixPtr weight = weights_[i]->getW();
-    size_t hsize = input->getHeight();
-    size_t wsize = weight->getHeight();
-    real scaleT = i == 0 ? real(0) : real(1);
-
-    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
-           !fullOutput_;
-    if (flag) {
-      // if the indecies are highly sparse,
-      // manully compute the multiplication of
-      // the input vector and the selected rows.
-      REGISTER_TIMER("selective.plain");
-      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-    } else {
-      // if the indecies is not sparse enough,
-      // use full mul instead
-      REGISTER_TIMER("selective.mul");
-      if (fullOutput_) {
-        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-      } else {
-        Matrix::resizeOrCreate(mmat_,
-                               hsize,
-                               wsize,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-        mmat_->mul(*input, *weight->getTranspose());
-        interOutput_->add3(mmat_);
-      }
-    }
-  }
-
-  if (biases_) {
-    interOutput_->addBias(*(biases_->getW()), 1);
-  }
-
-  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
-          !fullOutput_);
-  if (flag) {
-    // during generation, output of this layer is a sparse csr matrix,
-    // which is probably the input of maxid layer
-    // if the model is trained with multi-class-cross-entroy-with-selfnorm,
-    // activiation of this layer should be exponential, not softmax.
-
-    Argument arg;
-    arg.value = Matrix::create(interOutput_->getData(),
-                               1,
-                               nnz,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
-    activation_->forward(arg).check();
-  } else /* train and test in train, not generating */ {
-    // during training, this layer output value is *Matrix*, which is input of
-    // eg. multi-class-cross-entropy
-
-    // while training, every sample has a equal number of selected
-    // columns to be activated.
-    // note indices of multi-class-cross-entropy need to be remapped
-    // to this index.
-    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
-
-    forwardActivation();
-  }
-}
-
-void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-  MatrixPtr oGrad = getOutputGrad();
-  if (!fullOutput_) {
-    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
-                                               interOutput_->getRows(),
-                                               interOutput_->getCols(),
-                                               interOutput_->getHeight(),
-                                               interOutput_->getWidth(),
-                                               interOutput_->getElementCnt(),
-                                               FLOAT_VALUE,
-                                               SPARSE_CSR,
-                                               /*trans=*/false,
-                                               /*useGpu=*/useGpu_);
-  } else {
-    interOutGrad_ = Matrix::create(oGrad->getData(),
-                                   oGrad->getHeight(),
-                                   oGrad->getWidth(),
-                                   /*trans=*/false,
-                                   /*useGpu=*/useGpu_);
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // backward is different from FullyConnectedLayer
-  // because the weight is transposed
-  for (size_t i = 0; i < inputNum_; i++) {
-    AsyncGpuBlock block;
-    MatrixPtr preGrad = getInputGrad(i);
-    if (preGrad) {
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
-    }
-
-    MatrixPtr wGrad = weights_[i]->getWGrad();
-    if (wGrad) {
-      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-      MatrixPtr input = getInputValue(i);
-      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
-  if (candidates == nullptr) {
-    fillFullySelectiveData();
-    return;
-  }
-
-  size_t sampleNum = candidates->size();
-  size_t outputWidth = getSize();
-  size_t nnz =
-      std::accumulate(candidates->begin(),
-                      candidates->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-
-  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
-                                     sampleNum,
-                                     outputWidth,
-                                     nnz,
-                                     NO_VALUE,
-                                     SPARSE_CSR,
-                                     false,
-                                     false);
-  CHECK(this->cpuSelCols_ != nullptr);
-  CpuSparseMatrixPtr selCols =
-      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
-  int* rowOffsets = selCols->getRows();
-  int* colIndices = selCols->getCols();
-
-  rowOffsets[0] = 0;
-  int idx = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    if ((*candidates)[i].second > 0) {
-      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
-      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
-        colIndices[idx] = (*candidates)[i].first[j];
-        idx++;
-      }
-    } else {
-      rowOffsets[i + 1] = rowOffsets[i];
-    }
-  }
-
-  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
-  if (!useGpu_) {
-    this->selCols_ = this->cpuSelCols_;
-  } else {
-    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
-                                       sampleNum,
-                                       outputWidth,
-                                       nnz,
-                                       NO_VALUE,
-                                       SPARSE_CSR,
-                                       false,
-                                       true);
-    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-  }
-
-  fullOutput_ = false;
-}
-
-void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
-  if (config_.has_selected_colums()) {
-    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
-    fullOutput_ = false;
-  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
-    this->fillFullySelectiveData();
-  }  // else selCols_ is initialized by fillSelectiveData
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
deleted file mode 100644
index 3ba04d9b2ae..00000000000
--- a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief The SelectiveFullyConnectedLayer class
- *
- * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it
- * requires an additional input to indicate several selected columns, and only
- * compute the multiplications between the input matrices and the selected
- * columns of the parameter matrices of this layer. If the selected columns is
- * not specified, SelectiveFullyConnected layer acts exactly like
- * FullyConnectedLayer.
- *
- * The config file api is selective_fc_layer.
- */
-class SelectiveFullyConnectedLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- private:
-  /**
-   * Get selected columns each forward.
-   */
-  void getSelectiveCols();
-
-  MatrixPtr mmat_;
-  /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns.
-  MatrixPtr cpuSelCols_;
-  /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points
-  /// to cpuSelCols_.
-  MatrixPtr selCols_;
-  size_t inputNum_;
-
-  /// interOutput_ shared same memory with output_.value.
-  MatrixPtr interOutput_;
-
-  /// if fullOutput_ is false, interOutGrad_ sparse matrix
-  MatrixPtr interOutGrad_;
-
-  /// if true, means output_.value is the same as Fc Layer
-  bool fullOutput_;
-
- public:
-  explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
-      : Layer(config), selCols_(nullptr) {}
-
-  ~SelectiveFullyConnectedLayer() {}
-  void prefetch() override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  /**
-   * @brief Resize the output matrix size.
-   * And reset value to zero
-   */
-  void reserveOutput(size_t height, size_t width, size_t nnz);
-
-  /**
-   * @brief Fill candidates to select several activations as output.
-   * @param candidates specifies several selected columns of the parameter
-   * matrices of this layer.
-   * Multiplications only between the input matrices and the selected columns
-   * are computed.
-   * If the candidates is a nullptr, selective fc layer acts exactly like the
-   * fully connected layer.
-   * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH
-   */
-  void fillSelectiveData(
-      const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /**
-   * @brief Make SelectiveFC act as FullyConnectedLayer
-   */
-  void fillFullySelectiveData() { fullOutput_ = true; }
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
deleted file mode 100644
index 7b598e11acd..00000000000
--- a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for concatenating the first sequence with the second sequence
- * Input: two sequences each containing the same number of instances
- *        seq1 = [a1, a2, ..., an]
- *        seq2 = [b1, b2, ..., bn]
- * Output: a concatenated sequence of the two input sequences
- *        out = [a1, b1, a2, b2, ..., an, bn]
- */
-
-class SequenceConcatLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SequenceConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqconcat, SequenceConcatLayer);
-
-bool SequenceConcatLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(2U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceConcatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input1 = getInput(0);
-  size_t numSequences1 = input1.getNumSequences();
-  auto startPositions1 = input1.sequenceStartPositions->getVector(false);
-
-  const Argument& input2 = getInput(1);
-  size_t numSequences2 = input2.getNumSequences();
-  auto startPositions2 = input2.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input1.value->getWidth());
-  CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(dim, input2.value->getWidth());
-  CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  MatrixPtr inputValue1 = getInputValue(0);
-  MatrixPtr inputValue2 = getInputValue(1);
-
-  // reset output
-  reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      outputValue->subMatrix(offset, leftNumIns)
-          ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns)));
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      outputValue->subMatrix(offset, rightNumIns)
-          ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns)));
-      offset += rightNumIns;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) {
-      tgtBuf[seqId] = starts1[seqId] + starts2[seqId];
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceConcatLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr inputGrad2 = getInputGrad(1);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false);
-
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  size_t numSequences2 = startPositions2->getSize() - 1;
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      if (inputGrad1) {
-        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
-      }
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      if (inputGrad2) {
-        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
-      }
-      offset += rightNumIns;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
deleted file mode 100644
index 8735d71ba37..00000000000
--- a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for extracting the last instance of the input sequence.
- * Input: a sequence
- * If SequenceLevel = kNonseq:
- *   Output: a sequence containing only the last instance of the input sequence
- *   If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and getting last instance
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *   Check input sequence must has sub-sequence
- *   Output: a sequence containing only the last instance of each sub-sequence
- *           of the input sequence
- *
- * The config file api is last_seq and first_seq.
- */
-
-class SequenceLastInstanceLayer : public SequencePoolLayer {
- protected:
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-  std::vector<int> instanceIds_;
-
- public:
-  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
-
-bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-  reversed_ = config_.select_first();
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  return true;
-}
-
-void SequenceLastInstanceLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  auto starts = startPositions_->getData(false);
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-
-    instanceIds_.clear();
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
-      instanceIds_.push_back(insId);
-
-      outputValue->subMatrix(seqId, 1, tmpDest_)
-          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-
-  /*  activation, should set to 'linear' in most cases */
-  forwardActivation();
-}
-
-void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputGrad) {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
-
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
-          ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
deleted file mode 100644
index 243b795db42..00000000000
--- a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-bool SequencePoolLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // seqlastins/max/average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  stride_ = config_.seq_pool_stride();
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequencePoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "Input should be a sequence or subsequence for layer " << getName();
-
-  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  size_t dim = getSize();
-  // check
-  CHECK_EQ(dim, input.value->getWidth());
-  startPositions_ =
-      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
-  auto starts = startPositions_->getVector(false);
-  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
-  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
-
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-   */
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-        << "when trans_type = seq, input must hasSubseq";
-    output_.degradeSequence(input);
-  }
-  if (stride_ > 0) {
-    CHECK_EQ(input.hasSubseq(), 0UL)
-        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
-    newBatchSize_ = startPositions_->getSize() - 1;
-  }
-
-  resetOutput(newBatchSize_, dim);
-}
-
-void SequencePoolLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.h b/paddle/legacy/gserver/layers/SequencePoolLayer.h
deleted file mode 100644
index 1c019b31309..00000000000
--- a/paddle/legacy/gserver/layers/SequencePoolLayer.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
- *
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sequence}{input[i]}
- *    If stride_ > 0:
- *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence. Stride is the step size by which we slide
- *                a window upon the input sequence, and the pooling operation
- *                is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class SequencePoolLayer : public Layer {
- protected:
-  int type_;
-  std::unique_ptr<Weight> biases_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  size_t newBatchSize_;
-  ICpuGpuVectorPtr startPositions_;
-  int stride_;
-  // Whether the input sequence is reversed or not.
-  bool reversed_ = false;
-
- public:
-  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
deleted file mode 100644
index e3d40cab50a..00000000000
--- a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *  A layer for reshaping the sequence. Assume the input sequence has
- *  T instances, the dimension of each instance is M, and the input
- *  reshape_dim is N, then the output sequence has T*M/N instances,
- *  the dimension of each instance is N.
- *
- *  Note that T*M/N must be an integer.
- */
-
-class SequenceReshapeLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
-  MatrixPtr reshapedOutputGrad;
-
- public:
-  explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
-
-bool SequenceReshapeLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceReshapeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-
-  size_t inDim = input.value->getWidth();
-  size_t outDim = getSize();
-
-  size_t numSequences = input.getNumSequences();
-
-  // by default, we assume each instance as a sequence
-  IVectorPtr seqStarts;
-  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
-  int* startsData = seqStarts->getData();
-  for (int i = 0; i < input.getBatchSize() + 1; i++) {
-    startsData[i] = i;
-  }
-  const int* starts = startsData;
-
-  // if there is sequence, then use start positions
-  if (input.sequenceStartPositions) {
-    auto startPositions = input.sequenceStartPositions->getVector(false);
-    starts = startPositions->getData();
-    CHECK_EQ(starts[numSequences], input.getBatchSize());
-    CHECK_EQ(numSequences, startPositions->getSize() - 1);
-  }
-
-  for (size_t seqID = 0; seqID < numSequences; seqID++) {
-    size_t inNumIns = starts[seqID + 1] - starts[seqID];
-    size_t outNumIns = inNumIns * inDim / outDim;
-    CHECK_EQ(outNumIns * outDim, inNumIns * inDim);
-  }
-
-  MatrixPtr inputValue = getInputValue(0);
-
-  // reset output
-  reserveOutput(inputValue->getHeight() * inDim / outDim, outDim);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str());
-
-    outputValue->copyFrom(*inputValue);
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) {
-      tgtBuf[seqId] = starts[seqId] * inDim / outDim;
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceReshapeLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str());
-
-  if (inputGrad) {
-    Matrix::resizeOrCreate(reshapedOutputGrad,
-                           inputGrad->getHeight(),
-                           inputGrad->getWidth(),
-                           false,
-                           useGpu_);
-    reshapedOutputGrad->copyFrom(*outputGrad);
-    inputGrad->add(*reshapedOutputGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
deleted file mode 100644
index 3ed51c4ef2f..00000000000
--- a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-class SequenceSliceLayer : public Layer {
- public:
-  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second and the (optional) third input which are some
-   * selected indices of the give sequence to trim the sequence, are actually
-   * filled with int types so that storing int types information in real number
-   * matrices is very dangerous, since real numbers will be convered to int
-   * types. If a user fills this matrix himself, invalid data may occor.
-   */
-
-  MatrixPtr startIdsOnCpu_;
-  MatrixPtr endIdsOnCpu_;
-
-  std::vector<int> selectedRows_;
-  IVectorPtr rowIndice_;
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-  std::vector<int> outSubSeqStartPos_;
-  std::vector<int> outSeqStartPos_;
-
-  void checkInputs();
-  void copySliceIdsToCpu();
-  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
-};
-
-REGISTER_LAYER(seq_slice, SequenceSliceLayer);
-
-bool SequenceSliceLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_GE(inputLayers_.size(), 2U);
-  CHECK_LE(inputLayers_.size(), 3U);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceSliceLayer::checkInputs() {
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
-                           << "must be a sequence.";
-  const MatrixPtr indices1 = getInputValue(1);
-  CHECK_EQ(
-      indices1->getHeight(),
-      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
-                                               : inputSeq.getNumSequences()))
-      << "Height of the second input should be equal to number of sequence "
-      << "in the first input.";
-  if (inputLayers_.size() == 3) {
-    const MatrixPtr indices2 = getInputValue(2);
-    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
-        << "start indices and end indices should have the same height.";
-    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
-        << "start indices and end indices should have the same Width.";
-  }
-}
-
-void SequenceSliceLayer::copySliceIdsToCpu() {
-  const MatrixPtr indices1 = getInputValue(1);
-  if (inputLayers_.size() == 2U) {
-    if (config_.select_first()) {
-      Matrix::resizeOrCreate(startIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      startIdsOnCpu_->copyFrom(*indices1);
-      endIdsOnCpu_ = nullptr;
-    } else {
-      Matrix::resizeOrCreate(endIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      endIdsOnCpu_->copyFrom(*indices1);
-      startIdsOnCpu_ = nullptr;
-    }
-  } else if (inputLayers_.size() == 3U) {
-    Matrix::resizeOrCreate(startIdsOnCpu_,
-                           indices1->getHeight(),
-                           indices1->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    startIdsOnCpu_->copyFrom(*indices1);
-
-    const MatrixPtr indices2 = getInputValue(2);
-    Matrix::resizeOrCreate(endIdsOnCpu_,
-                           indices2->getHeight(),
-                           indices2->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    endIdsOnCpu_->copyFrom(*indices2);
-  }
-}
-
-void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
-                                         const MatrixPtr ends) {
-  CHECK(starts || ends) << "At least one of the start or end indices "
-                        << "should be given.";
-
-  bool hasSubseq = getInput(0).hasSubseq();
-
-  outSeqStartPos_.resize(1, 0);
-  outSubSeqStartPos_.resize(1, 0);
-  selectedRows_.clear();
-
-  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
-  size_t rowIdx = 0;
-  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
-    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
-      for (size_t k = 0; k < beamSize; ++k) {
-        if (starts && starts->getElement(rowIdx, k) == -1.) break;
-        if (ends && ends->getElement(rowIdx, k) == -1.) break;
-
-        int begPos = inputSeqInfoVec_[i][j];
-        if (starts) begPos += starts->getElement(rowIdx, k);
-
-        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
-        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
-
-        int seqLen = endPos - begPos + 1;
-        CHECK_GT(seqLen, 0);
-        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
-        hasSubseq
-            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
-            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
-      }
-      rowIdx++;
-    }
-    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
-
-  if (hasSubseq) {
-    ICpuGpuVector::resizeOrCreate(
-        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
-    output_.subSequenceStartPositions->copyFrom(
-        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
-  }
-}
-
-void SequenceSliceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  checkInputs();
-
-  const Argument& inputSeq = getInput(0);
-  inputSeqInfoVec_.clear();
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  if (!useGpu_) {
-    if (inputLayers_.size() == 2U) {
-      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
-      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
-    } else if (inputLayers_.size() == 3U) {
-      startIdsOnCpu_ = getInputValue(1);
-      endIdsOnCpu_ = getInputValue(2);
-    }
-  } else {
-    copySliceIdsToCpu();
-  }
-
-  /*
-   * calculate the selected row indices in a batch, and build the output
-   * sequence information.
-   */
-  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
-
-  resetOutput(selectedRows_.size(), getSize());
-
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SequenceSliceLayer::backward(const UpdateCallback& callback) {
-  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.cpp b/paddle/legacy/gserver/layers/SequenceToBatch.cpp
deleted file mode 100644
index 5d0d588e67a..00000000000
--- a/paddle/legacy/gserver/layers/SequenceToBatch.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SequenceToBatch.h"
-#include <string.h>
-#include <algorithm>
-#include <iostream>
-#include <vector>
-
-namespace paddle {
-
-void SequenceToBatch::resizeOrCreateBatch(int batchSize,
-                                          size_t numSequences,
-                                          const int *seqStarts,
-                                          bool reversed,
-                                          bool prevBatchState) {
-  CHECK_EQ(seqStarts[numSequences], batchSize);
-  IVector::resizeOrCreate(seq2BatchIdx_, batchSize, useGpu_);
-  if (!useGpu_) {
-    cpuSeq2BatchIdx_ = seq2BatchIdx_;
-  } else {
-    IVector::resizeOrCreate(cpuSeq2BatchIdx_, batchSize, false);
-  }
-
-  /*
-   * calculate the length of each sequence & sort sequence index by the length
-   * Exampel:  Sequences = {s0, s1, s2}
-   *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-   *           seqStartAndLength[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
-   */
-  struct SeqStartAndLength {
-    int start_;
-    int length_;
-    int seqIdx_;
-    SeqStartAndLength(int start, int length, int seqIdx)
-        : start_(start), length_(length), seqIdx_(seqIdx) {}
-  };
-  std::vector<SeqStartAndLength> seqStartAndLength;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    int length = seqStarts[seqId + 1] - seqStarts[seqId];
-    seqStartAndLength.emplace_back(seqStarts[seqId], length, seqId);
-  }
-  std::sort(seqStartAndLength.begin(),
-            seqStartAndLength.end(),
-            [](SeqStartAndLength a, SeqStartAndLength b) {
-              return a.length_ > b.length_;
-            });
-
-  /*
-   * calculate the start position of each batch
-   * (numBatch equal the maxLength of sequences)
-   * Exampel:  Sequences = {s0, s1, s2}
-   *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-   *           numBatch = 5,
-   *           batchIndex = {b0, b1, b2, b3, b4}
-   *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
-   *           batchStartPositions[6] = {0, 3, 6, 9, 11, 12}
-   */
-  numBatch_ = (size_t)seqStartAndLength[0].length_;
-
-  IVector::resizeOrCreate(batchStartPositions_, numBatch_ + 1, false);
-  int *batchStartPositions = batchStartPositions_->getData();
-  batchStartPositions[0] = 0;
-  for (size_t n = 0; n < numBatch_; n++) {
-    int batchId = batchStartPositions[n];
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      size_t seqLength = seqStartAndLength[i].length_;
-      int start = seqStartAndLength[i].start_;
-      if (n < seqLength) {
-        if (!reversed) {
-          cpuSeq2BatchIdx_->getData()[batchId] = start + n;
-        } else {
-          cpuSeq2BatchIdx_->getData()[batchId] = start + seqLength - 1 - n;
-        }
-        batchId++;
-      } else {
-        break;
-      }
-    }
-    batchStartPositions[n + 1] = batchId;
-  }
-  if (useGpu_) {
-    seq2BatchIdx_->copyFrom(*cpuSeq2BatchIdx_);
-  }
-  if (prevBatchState) {
-    IVector::resizeOrCreate(seqIdx_, numSequences, useGpu_);
-    IVector::resizeOrCreate(seqEndIdxInBatch_, numSequences, useGpu_);
-    if (!useGpu_) {
-      cpuSeqIdx_ = seqIdx_;
-      cpuSeqEndIdxInBatch_ = seqEndIdxInBatch_;
-    } else {
-      IVector::resizeOrCreate(cpuSeqIdx_, numSequences, false);
-      IVector::resizeOrCreate(cpuSeqEndIdxInBatch_, numSequences, false);
-    }
-    int *seqIdx = cpuSeqIdx_->getData();
-    int *seqEndIdxInBatch = cpuSeqEndIdxInBatch_->getData();
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      seqIdx[i] = seqStartAndLength[i].seqIdx_;
-    }
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      if (seqStartAndLength[i].length_ > 0) {
-        seqEndIdxInBatch[seqStartAndLength[i].seqIdx_] =
-            batchStartPositions[seqStartAndLength[i].length_ - 1] + i;
-      } else {
-        seqEndIdxInBatch[seqStartAndLength[i].seqIdx_] = 0;
-      }
-    }
-    if (useGpu_) {
-      seqIdx_->copyFrom(*cpuSeqIdx_);
-      seqEndIdxInBatch_->copyFrom(*cpuSeqEndIdxInBatch_);
-    }
-  }
-}
-
-void SequenceToBatch::resizeOrCreate(Matrix &seqValue) {
-  Matrix::resizeOrCreate(batchValue_,
-                         seqValue.getHeight(),
-                         seqValue.getWidth(),
-                         /* trans= */ false,
-                         useGpu_);
-}
-
-MatrixPtr SequenceToBatch::getBatchValue(int batchId, int numRows) {
-  return getBatchValue(*batchValue_, batchId, numRows);
-}
-
-MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue,
-                                         int batchId,
-                                         int numRows) {
-  int *batchStartPositions = batchStartPositions_->getData();
-  int start = batchStartPositions[batchId];
-  int maxRows = batchStartPositions[batchId + 1] - batchStartPositions[batchId];
-  if (numRows == 0) {
-    numRows = maxRows;
-  } else {
-    CHECK_LE(numRows, maxRows);
-  }
-  return batchValue.subMatrix(start, numRows);
-}
-
-void SequenceToBatch::prevOutput2Batch(Matrix &src, Matrix &dst) {
-  sequence2BatchCopy(dst, src, *seqIdx_, true);
-}
-
-void SequenceToBatch::getSeqOutputFromBatch(Matrix &sequence, Matrix &batch) {
-  sequence2BatchCopy(sequence, batch, *seqEndIdxInBatch_, true);
-}
-
-void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
-                                         Matrix &sequence,
-                                         IVector &seq2BatchIdx,
-                                         bool seq2batch) {
-  int seqWidth = sequence.getWidth();
-  int batchCount = batch.getHeight();
-  real *batchData = batch.getData();
-  real *seqData = sequence.getData();
-  int *idxData = seq2BatchIdx.getData();
-
-  if (useGpu_) {
-    hl_sequence2batch_copy(
-        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
-  } else {
-    if (seq2batch) {
-#ifdef PADDLE_USE_MKLML
-      const int blockMemSize = 8 * 1024;
-      const int blockSize = blockMemSize / sizeof(real);
-#pragma omp parallel for collapse(2)
-      for (int i = 0; i < batchCount; ++i) {
-        for (int j = 0; j < seqWidth; j += blockSize) {
-          memcpy(batch.rowBuf(i) + j,
-                 sequence.rowBuf(idxData[i]) + j,
-                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
-                                            : blockMemSize);
-        }
-      }
-#else
-      for (int i = 0; i < batchCount; ++i) {
-        memcpy(batch.rowBuf(i),
-               sequence.rowBuf(idxData[i]),
-               seqWidth * sizeof(real));
-      }
-#endif
-    } else {
-#ifdef PADDLE_USE_MKLML
-#pragma omp parallel for
-#endif
-      for (int i = 0; i < batchCount; ++i) {
-        memcpy(sequence.rowBuf(idxData[i]),
-               batch.rowBuf(i),
-               seqWidth * sizeof(real));
-      }
-    }
-  }
-}
-
-void SequenceToBatch::sequence2BatchAdd(Matrix &batch,
-                                        Matrix &sequence,
-                                        IVector &seq2BatchIdx,
-                                        bool seq2batch) {
-  int seqWidth = sequence.getWidth();
-  int batchCount = batch.getHeight();
-  real *batchData = batch.getData();
-  real *seqData = sequence.getData();
-  int *idxData = seq2BatchIdx.getData();
-
-  if (useGpu_) {
-    hl_sequence2batch_add(
-        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
-  } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
-        batch.subMatrix(i, 1)->add(*sequence.subMatrix(idxData[i], 1));
-      } else {
-        sequence.subMatrix(idxData[i], 1)->add(*batch.subMatrix(i, 1));
-      }
-    }
-  }
-}
-
-void SequenceToBatch::copyFromSeq(Matrix &seqValue) {
-  Matrix::resizeOrCreate(batchValue_,
-                         seqValue.getHeight(),
-                         seqValue.getWidth(),
-                         /* trans= */ false,
-                         useGpu_);
-  sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, true);
-}
-
-void SequenceToBatch::copyBackSeq(Matrix &seqValue) {
-  sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, false);
-}
-
-void SequenceToBatch::copy(Matrix &seqValue,
-                           Matrix &batchValue,
-                           bool seq2batch) {
-  sequence2BatchCopy(batchValue, seqValue, *seq2BatchIdx_, seq2batch);
-}
-
-void SequenceToBatch::add(Matrix &seqValue,
-                          Matrix &batchValue,
-                          bool seq2batch) {
-  sequence2BatchAdd(batchValue, seqValue, *seq2BatchIdx_, seq2batch);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.h b/paddle/legacy/gserver/layers/SequenceToBatch.h
deleted file mode 100644
index 7ed517937d4..00000000000
--- a/paddle/legacy/gserver/layers/SequenceToBatch.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-/*
- * This class can used to modify the matrix structure of sequence matrix into
- * batch structure.
- * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
- * batch matrix:    [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
- * Cn_s is the state for sequence s at time n.
- *
- * Exampel:  sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
- *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
- *           batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
- *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
- *
- * Use:
- * Input: seqMatrix, seqStarts(Sequence Start Positions)
- * Output: batchMatrix
- * 1. SequenceToBatch seq2batch;
- * 2. seq2batch.resizeOrCreateBatch(seqStarts);     // calculate seq2BatchIdx
- * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
- *
- */
-class SequenceToBatch {
- public:
-  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
-
-  /* resize and calculate the batchIndex_ */
-  void resizeOrCreateBatch(int batchSize,
-                           size_t numSequences,
-                           const int *seqStarts,
-                           bool reversed,
-                           bool prevBatchState = false);
-
-  /* sequence matrix and batch matrix copy:
-   * seq2batch: copy(seqValue, batchValue, true);
-   * batch2seq: copy(seqValue, batchValue, false);
-   */
-  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  /* sequence/batch matrix add to batch/sequence matrix */
-  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
-
-  size_t getNumBatch() const { return numBatch_; }
-
-  /* resize or create a batch matrix(batchValue_) */
-  void resizeOrCreate(Matrix &seqValue);
-  /* copy seqValue to batchValue_ */
-  void copyFromSeq(Matrix &seqValue);
-  /* copy batchValue_ to seqValue */
-  void copyBackSeq(Matrix &seqValue);
-  MatrixPtr getBatchValue(int batchId, int numRows = 0);
-  MatrixPtr getBatchValue() { return batchValue_; }
-  /*tranfer preBatchOutput to batch struct*/
-  void prevOutput2Batch(Matrix &src, Matrix &dst);
-  /*get sequence output from batch struct*/
-  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
-
-  /* Copy the index from another seq2batch. */
-  void shareIndexWith(const SequenceToBatch &seq2batch) {
-    CHECK(useGpu_ == seq2batch.useGpu_);
-    batchStartPositions_ = seq2batch.batchStartPositions_;
-    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
-    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
-    numBatch_ = seq2batch.numBatch_;
-  }
-
- protected:
-  void sequence2BatchCopy(Matrix &batch,
-                          Matrix &sequence,
-                          IVector &seq2BatchIdx,
-                          bool seq2batch);
-  void sequence2BatchAdd(Matrix &batch,
-                         Matrix &sequence,
-                         IVector &seq2BatchIdx,
-                         bool seq2batch);
-
-  IVectorPtr batchStartPositions_;
-  IVectorPtr seq2BatchIdx_;
-  IVectorPtr cpuSeq2BatchIdx_;
-  IVectorPtr cpuSeqIdx_;
-  IVectorPtr cpuSeqEndIdxInBatch_;
-  IVectorPtr seqIdx_;
-  IVectorPtr seqEndIdxInBatch_;
-  size_t numBatch_;
-  bool useGpu_;
-  MatrixPtr batchValue_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SliceProjection.cpp b/paddle/legacy/gserver/layers/SliceProjection.cpp
deleted file mode 100644
index b474f2db759..00000000000
--- a/paddle/legacy/gserver/layers/SliceProjection.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * SliceProjection can slice the input value into multiple parts,
- * and then select some of them to merge into a new output.
- *
- * First, calculate the slices that need to be merged into the output.
- * slices = input.slices().for_output()
- *
- * Second, merge each slice into the output.
- * for(auto slice: slices) {
- *   out.addAtOffset(slice, offset);
- * }
- *
- * Input slices as output: s0, s1, ...:
- *   -----------------------
- *   |///|   |//////|      |
- *   |/s0|   |//s1//|      |
- *   |///|   |//////|      |
- *   -----------------------
- * Output, merge s0, s1, ... into one output:
- *   ----------------
- *   |///|//////|   |
- *   |/s0|//s1//|...|
- *   |///|//////|   |
- *   ----------------
- *
- * The config file api is slice_projection.
- */
-class SliceProjection : public Projection {
- public:
-  SliceProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::vector<std::pair<size_t, size_t>> slices_;
-};
-
-REGISTER_PROJECTION(slice, SliceProjection);
-
-/**
- * Constructed function.
- * @note SliceProjection should not have any parameter.
- */
-SliceProjection::SliceProjection(const ProjectionConfig& config,
-                                 const ParameterPtr& parameter,
-                                 bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'slice' projection should not have any parameter";
-
-  slices_.reserve(config.slices_size());
-  for (const auto& slice : config.slices()) {
-    slices_.push_back(std::make_pair(slice.start(), slice.end()));
-  }
-}
-
-void SliceProjection::forward() {
-  size_t offset = 0;
-  for (auto& slice : slices_) {
-    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
-    out_->value->addAtOffset(*slice_out, offset);
-    offset += slice_out->getWidth();
-  }
-}
-
-void SliceProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    size_t offset = 0;
-    for (auto& slice : slices_) {
-      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
-      slice_out->addAtOffset(*out_->grad, offset);
-      offset += slice_out->getWidth();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
deleted file mode 100644
index 9168fd7dda6..00000000000
--- a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for applying a slope and an intercept to the input
- * element-wise.
- * This layer is used in NEURAL TURING MACHINE.
- * @note There is no activation and weight in this layer.
- *
- * \f[
- *    y = ax + b
- * \f]
- *
- * Here, a is scale and b is offset, which are provided as attributes of the
- * layer.
- *
- * The config file api is slope_intercept_layer.
- */
-
-class SlopeInterceptLayer : public Layer {
- public:
-  explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
-
-bool SlopeInterceptLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SlopeInterceptLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t size = getSize();
-
-  CHECK_EQ(size, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
-    outV->mulScalar(*inV, config_.slope());
-    outV->add(config_.intercept());
-  }
-}
-
-void SlopeInterceptLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str());
-    inG->add(*outG, config_.slope());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
deleted file mode 100644
index b445a399ef6..00000000000
--- a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SpatialPyramidPoolLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(spp, SpatialPyramidPoolLayer);
-
-ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
-                                                    size_t imgSizeH,
-                                                    size_t channels,
-                                                    size_t pyramidLevel,
-                                                    std::string& poolType) {
-  ProjectionConfig config;
-  config.set_type("pool");
-  PoolConfig* conf = config.mutable_pool_conf();
-  conf->set_channels(channels);
-  conf->set_img_size(imgSizeW);
-  conf->set_img_size_y(imgSizeH);
-  conf->set_pool_type(poolType);
-
-  int numBins = std::pow(2, pyramidLevel);
-
-  int sizeH = std::ceil(imgSizeH / static_cast<double>(numBins));
-  int paddingH = (sizeH * numBins - imgSizeH + 1) / 2;
-  int outSizeH = outputSize(imgSizeH, sizeH, paddingH, sizeH, true);
-
-  int sizeW = std::ceil(imgSizeW / static_cast<double>(numBins));
-  int paddingW = (sizeW * numBins - imgSizeW + 1) / 2;
-  int outSizeW = outputSize(imgSizeW, sizeW, paddingW, sizeW, true);
-
-  conf->set_stride(sizeW);
-  conf->set_stride_y(sizeH);
-  conf->set_size_x(sizeW);
-  conf->set_size_y(sizeH);
-  conf->set_padding(paddingW);
-  conf->set_padding_y(paddingH);
-  conf->set_output_x(outSizeW);
-  conf->set_output_y(outSizeH);
-  config.set_output_size(outSizeH * outSizeW * channels);
-  return config;
-}
-
-size_t SpatialPyramidPoolLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = conf.img_size();
-  }
-
-  size_t outputH = 1;
-  size_t outputW = (std::pow(4, pyramidHeight_) - 1) / (4 - 1);
-
-  layerSize = outputH * outputW * channels_;
-  return layerSize;
-}
-
-bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const SppConfig& sppConf = config_.inputs(0).spp_conf();
-  pyramidHeight_ = sppConf.pyramid_height();
-  poolType_ = sppConf.pool_type();
-
-  const ImageConfig& imageConf = sppConf.image_conf();
-  channels_ = imageConf.channels();
-  imgSizeW_ = imageConf.img_size();
-  imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
-  poolProjections_.reserve(pyramidHeight_);
-  projCol_.reserve(pyramidHeight_);
-  projOutput_.resize(pyramidHeight_);
-
-  size_t startCol = 0;
-  size_t endCol = 0;
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    poolProjections_.emplace_back(PoolProjection::create(
-        getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_),
-        nullptr,
-        useGpu_));
-    endCol += poolProjections_[i]->getOutputSize();
-    projCol_.push_back(std::make_pair(startCol, endCol));
-    startCol = endCol;
-  }
-  CHECK_EQ(endCol, getSize());
-  return true;
-}
-
-void SpatialPyramidPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  resetOutput(batchSize, getSize());
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    size_t startCol = projCol_[i].first;
-    size_t endCol = projCol_[i].second;
-    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
-    if (output_.grad) {
-      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
-    }
-  }
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    poolProjections_[i]->forward(&getInput(0), &projOutput_[i], passType);
-  }
-}
-
-void SpatialPyramidPoolLayer::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    if (poolProjections_[i]) {
-      poolProjections_[i]->backward(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
deleted file mode 100644
index 6d8ed9c8788..00000000000
--- a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "PoolProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-/**
- * @brief A layer for spatial pyramid pooling on the input image by taking
- * the max, average, etc. within regions, so that the result vector of
- * different sized images are of the same size.
- *
- * The config file api is spp_layer.
- */
-
-class SpatialPyramidPoolLayer : public Layer {
- protected:
-  size_t channels_;
-  size_t imgSizeW_;
-  size_t imgSizeH_;
-  size_t pyramidHeight_;
-  std::string poolType_;
-
-  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-
- public:
-  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  ProjectionConfig getConfig(size_t sizeX_,
-                             size_t sizeY_,
-                             size_t channels,
-                             size_t pyamidLevel_,
-                             std::string& poolType_);
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
deleted file mode 100644
index f363c2ac8dd..00000000000
--- a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-class SubNestedSequenceLayer : public Layer {
- public:
-  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /*
-   * This functions generates the indices of rows in a batch according to the
-   * indices of selected sub-sequence in each sequence.
-   *
-   * Examples:
-   * selectedIndices:
-   *   [
-   *     [0, 1, -1],
-   *     [0, 1, 2],
-   *     [0, -1, -1],
-   *     [0, 2, 3],
-   *   ]
-   * inputSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   *
-   * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
-   */
-
-  void calSelectedRows(const MatrixPtr selectedIndices,
-                       const std::vector<std::vector<int>>& inputSeqInfo);
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second is some selected indices of the give sequence to trim
-   * the nested sequence, are actually filled with int types so that storing
-   * int types information in real number matrices is very dangerous, since
-   * real numbers will be convered to int types. If a user fills this matrix
-   * himself, invalid data may occor.
-   *
-   * if the second input of this layer is on GPU memory, copy it to CPU memory.
-   */
-  MatrixPtr selIdsCpu_;
-
-  /*
-   * reorganize sequenceStartPositions and subSequenceStartPositions
-   * into a 2d vector to facilitate the sequence selection process.
-   */
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-
-  /* store the final selected row indices in a batch */
-  IVectorPtr rowIndice_;
-  /* rowIndice_ and selectedRows_ actually share a same memory. */
-  std::vector<int> selectedRows_;
-};
-
-REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
-
-bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(2U, inputLayers_.size());
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubNestedSequenceLayer::calSelectedRows(
-    const MatrixPtr selectedIndices,
-    const std::vector<std::vector<int>>& inputSeqInfo) {
-  selectedRows_.clear();
-
-  std::vector<int> outSeqStartInfo(1, 0);
-  std::vector<int> outSubSeqStartInfo(1, 0);
-
-  size_t seqNum = selectedIndices->getHeight();
-  size_t beamSize = selectedIndices->getWidth();
-  for (size_t i = 0; i < seqNum; ++i) {
-    for (size_t j = 0; j < beamSize; ++j) {
-      if (selectedIndices->getElement(i, j) == -1.) break;
-      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
-      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
-
-      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
-                         inputSeqInfoVec_[i][selSubSeqIdx];
-      for (size_t k = 0; k < subSeqLen; ++k)
-        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
-      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
-    }
-    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
-
-  ICpuGpuVector::resizeOrCreate(
-      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
-  output_.subSequenceStartPositions->copyFrom(
-      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
-}
-
-void SubNestedSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
-                              << "must be a nested sequence.";
-  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
-
-  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
-    /*
-     * Currently, the second input for this layer is generated by
-     * kmax_sequence_score_layer whose output is always stored on CPU,
-     * or a data_layer which canbe on GPU.
-     *
-     * If the second input is on GPU, copy it to CPU memory, because this
-     * input always uses very few memory, and operations related to it are
-     * all logic control, not computations.
-     */
-    Matrix::resizeOrCreate(selIdsCpu_,
-                           selectedIndices->getHeight(),
-                           selectedIndices->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    selIdsCpu_->copyFrom(*selectedIndices);
-  } else {
-    selIdsCpu_ = selectedIndices;
-  }
-
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
-
-  resetOutput(selectedRows_.size(), getSize());
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inputSeqGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
deleted file mode 100644
index 36796f04739..00000000000
--- a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for taking the subsequence according to given offset and size
- * Input: original sequence, offset, size
- * Output: subsequence
- */
-
-class SubSequenceLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-
- public:
-  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(subseq, SubSequenceLayer);
-
-bool SubSequenceLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(3U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input = getInput(0);
-  size_t numSequences1 = input.getNumSequences();
-  auto startPositions1 = input.sequenceStartPositions->getVector(false);
-
-  const Argument& offsetSeq = getInput(1);
-  size_t numSequences2 = offsetSeq.getNumSequences();
-  auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false);
-
-  const Argument& sizeSeq = getInput(2);
-  size_t numSequences3 = sizeSeq.getNumSequences();
-  auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input.value->getWidth());
-
-  CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize());
-  CHECK_EQ(numSequences3, startPositions3->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-  CHECK_EQ(numSequences2, numSequences3);
-
-  MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  CHECK_EQ(offsetValue->getSize(), numSequences1);
-  CHECK_EQ(sizeValue->getSize(), numSequences1);
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-
-  // get total height of output
-  size_t height = 0;
-  for (size_t seqId = 0; seqId < numSequences1; seqId++) {
-    height += sizes[seqId];
-  }
-
-  // reset output
-  resetOutput(height, dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str());
-
-    size_t offsetIn = 0;
-    size_t offsetOut = 0;
-    size_t size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      outputValue->subMatrix(offsetOut, size, tmpDest_)
-          ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_)));
-
-      offsetOut += size;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-    int offset = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      tgtBuf[seqId] = offset;
-      offset += sizes[seqId];
-    }
-    tgtBuf[numSequences1] = offset;
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SubSequenceLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  const int* starts1 = startPositions1->getData();
-
-  const Argument& offsetSeq = getInput(1);
-  const Argument& sizeSeq = getInput(2);
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str());
-
-    int offsetIn = 0;
-    int offsetOut = 0;
-    int size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      inputGrad1->subMatrix(offsetIn, size, tmpDest_)
-          ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_)));
-      offsetOut += size;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
deleted file mode 100644
index 410f4dd7c90..00000000000
--- a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for sum-to-one normalization,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]}
- * \f]
- * where \f$in\f$ is a (batchSize x dataDim) input vector,
- * and \f$out\f$ is a (batchSize x dataDim) output vector.
- *
- * The config file api is sum_to_one_norm_layer.
- */
-
-class SumToOneNormLayer : public Layer {
- protected:
-  /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
-  MatrixPtr reciprocalRowSum_;
-  /// dotSum = output_.grad \f$.*\f$ output_.value
-  MatrixPtr dotSum_;
-
- public:
-  explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
-
-bool SumToOneNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SumToOneNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(dataDim, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_);
-    inV->rowSum(*reciprocalRowSum_);
-
-    // todo: matrix checks
-    CHECK_GT(reciprocalRowSum_->getMin(), 0.0);
-
-    reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0);
-
-    // outV = inV * reciprocalRowSum
-    outV->rowScale(0, *inV, *reciprocalRowSum_);
-  }
-}
-
-void SumToOneNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV->getHeight();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-
-    // dotSum = outG .* outV
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-
-    // inG += -1 * (dotSum / rowSum)
-    dotSum_->dotMul(*dotSum_, *reciprocalRowSum_);
-    inG->rowAdd(0, *inG, *dotSum_, -1.0);
-    // inG += outG * (1/rowSum)
-    inG->addRowScale(0, *outG, *reciprocalRowSum_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
deleted file mode 100644
index 513f3df7bca..00000000000
--- a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOrderLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(switch_order, SwitchOrderLayer);
-
-bool SwitchOrderLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  auto& img_conf = config_.inputs(0).image_conf();
-  size_t inD = img_conf.img_size_z();
-  size_t inH =
-      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
-  size_t inW = img_conf.img_size();
-  size_t inC = img_conf.channels();
-  inH = inH * inD;
-  inDims_ = TensorShape({0, inC, inH, inW});
-  outDims_ = TensorShape(4);
-
-  auto& reshape_conf = config_.reshape_conf();
-  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
-    heightAxis_.push_back(reshape_conf.height_axis(i));
-  }
-  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
-    widthAxis_.push_back(reshape_conf.width_axis(i));
-  }
-  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
-  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
-  return true;
-}
-
-void SwitchOrderLayer::setOutDims() {
-  outDims_.setDim(0, inDims_[0]);
-  outDims_.setDim(1, inDims_[2]);
-  outDims_.setDim(2, inDims_[3]);
-  outDims_.setDim(3, inDims_[1]);
-  reshapeHeight_ = 1;
-  for (size_t i = 0; i < heightAxis_.size(); i++) {
-    reshapeHeight_ *= outDims_[heightAxis_[i]];
-  }
-  output_.setFrameHeight(reshapeHeight_);
-  reshapeWidth_ = 1;
-  for (size_t i = 0; i < widthAxis_.size(); i++) {
-    reshapeWidth_ *= outDims_[widthAxis_[i]];
-  }
-  output_.setFrameWidth(reshapeWidth_);
-}
-
-void SwitchOrderLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int d = inputLayers_[0]->getOutput().getFrameDepth();
-  d = (d == 0 ? 1 : d);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h * d);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  int totalCount = input->getElementCnt();
-  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
-  if (channels != 0) inDims_.setDim(1, channels);
-}
-
-void SwitchOrderLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
-  if (heightAxis_.size() > 0) {
-    resetOutput(reshapeHeight_, reshapeWidth_);
-  }
-
-  // switch NCHW to NHWC
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_);
-  nchw2nhwc_[0]->calc(inputs, outputs);
-  forwardActivation();
-}
-
-void SwitchOrderLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  backwardActivation();
-
-  // switch NHWC to NCHW
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  nhwc2nchw_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.h b/paddle/legacy/gserver/layers/SwitchOrderLayer.h
deleted file mode 100644
index 8a551a2bba6..00000000000
--- a/paddle/legacy/gserver/layers/SwitchOrderLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer calculate softmax in image channel dimension.
- */
-class SwitchOrderLayer : public Layer {
- public:
-  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SwitchOrderLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void setInDims();
-  void setOutDims();
-
- protected:
-  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
-  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-  std::vector<int> heightAxis_;
-  std::vector<int> widthAxis_;
-  size_t reshapeHeight_;
-  size_t reshapeWidth_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TableProjection.cpp b/paddle/legacy/gserver/layers/TableProjection.cpp
deleted file mode 100644
index 326e241d075..00000000000
--- a/paddle/legacy/gserver/layers/TableProjection.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TableProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(table, TableProjection);
-
-TableProjection::TableProjection(const ProjectionConfig& config,
-                                 const ParameterPtr& parameter,
-                                 bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  table_.reset(
-      new Weight(config.input_size(), config.output_size(), parameter));
-}
-
-void TableProjection::prefetch(const Argument* in) {
-  CHECK(in->ids);
-  auto* sparseParam =
-      dynamic_cast<SparsePrefetchRowCpuMatrix*>(table_->getW().get());
-  if (sparseParam) {
-    sparseParam->addRows(in->ids);
-  }
-}
-
-void TableProjection::forward() {
-  CHECK(in_->ids);
-  out_->value->selectRows(*table_->getW(), *in_->ids);
-}
-
-void TableProjection::backward(const UpdateCallback& callback) {
-  if (table_->getWGrad()) {
-    CHECK(in_->ids);
-    out_->grad->addToRows(*table_->getWGrad(), *in_->ids);
-    parameter_->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TableProjection.h b/paddle/legacy/gserver/layers/TableProjection.h
deleted file mode 100644
index 60286149f42..00000000000
--- a/paddle/legacy/gserver/layers/TableProjection.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * Table projection takes index data input. It select rows from parameter
- * where row_id is in input_ids:
- * \f[
- *   out.row[i] += table.row[ids[i]]
- * \f]
- * where \f$out\f$ is out, \f$table\f$ is parameter, \f$ids\f$ is input_ids,
- * and \f$i\f$ is row_id.
- *
- * The config file api is table_projection.
- *
- * @note If \f$ids[i] = -1\f$, it will be ignored.
- */
-class TableProjection : public Projection {
- public:
-  TableProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  /**
-   * If use sparse row matrix as parameter, prefetch feature ids in input label.
-   */
-  virtual void prefetch(const Argument* in);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> table_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.cpp b/paddle/legacy/gserver/layers/TensorLayer.cpp
deleted file mode 100644
index 7f874bce0f2..00000000000
--- a/paddle/legacy/gserver/layers/TensorLayer.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(tensor, TensorLayer);
-
-bool TensorLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK_EQ(inputLayers_.size(), 2LU);
-  CHECK(parameters_[0]);
-  CHECK(!parameters_[1]);
-
-  // Option the parameters
-  size_t height = inputLayers_[0]->getSize();
-  size_t width = inputLayers_[1]->getSize();
-  CHECK_EQ(width * height * getSize(), parameters_[0]->getSize());
-
-  for (size_t i = 0; i < getSize(); ++i) {
-    // create a new weight
-    Weight* w = new Weight(height, width, parameters_[0], i * width * height);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void TensorLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  { resetOutput(batchSize, size); }
-
-  MatrixPtr outV = getOutputValue();
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* e1 * W * trans(e2) */ {
-    MatrixPtr input1 = getInputValue(0);
-    MatrixPtr input2 = getInputValue(1);
-    MatrixPtr tmpMat = Matrix::create(input2->getHeight(),
-                                      input2->getWidth(),
-                                      /* trans= */ false,
-                                      input2->useGpu());
-    REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-      tmpMat->mul(*input1, *weights, 1, 0);
-      outV->rowDotMul(i, *tmpMat, *input2);
-    }
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void TensorLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  MatrixPtr input1 = getInputValue(0);
-  MatrixPtr input2 = getInputValue(1);
-  MatrixPtr oGrad = getOutputGrad();
-  MatrixPtr tmpMat = Matrix::create(input1->getHeight(),
-                                    input1->getWidth(),
-                                    /* trans= */ false,
-                                    input1->useGpu());
-
-  /* trans(grad * e1) * e2 */ {
-    REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      if (weights_[i]->getWGrad()) {
-        tmpMat->rowScale(i, *input1, *oGrad);
-        MatrixPtr input1_T = tmpMat->getTranspose();
-        weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1);
-      }
-    }
-  }
-
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */ {
-    MatrixPtr preGrad1 = getInputGrad(0);
-    MatrixPtr preGrad2 = getInputGrad(1);
-
-    REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-
-      if (NULL != preGrad1) { /* (grad * e2) * trans(W) */
-        tmpMat->rowScale(i, *input2, *oGrad);
-        MatrixPtr weights_T = weights->getTranspose();
-        preGrad1->mul(*tmpMat, *weights_T, 1, 1);
-      }
-      if (NULL != preGrad2) { /* (grad * e1) * W */
-        tmpMat->rowScale(i, *input1, *oGrad);
-        preGrad2->mul(*tmpMat, *weights, 1, 1);
-      }
-    }
-  }
-  hl_set_sync_flag(syncFlag);
-  parameters_[0]->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.h b/paddle/legacy/gserver/layers/TensorLayer.h
deleted file mode 100644
index fc491a7c9f2..00000000000
--- a/paddle/legacy/gserver/layers/TensorLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief TensorLayer takes two input vectors.
- * \f[
- *     y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1
- * \f]
- *
- * - \f$x_{1}\f$: the first input, size is M.
- * - \f$x_{2}\f$: the second input, size is N.
- * - y: output, size is K.
- * - \f$y_{i}\f$: i-th element of y.
- * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
- * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
- *
- * The config file api is tensor_layer.
- */
-
-class TensorLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.cpp b/paddle/legacy/gserver/layers/TransLayer.cpp
deleted file mode 100644
index fd1d435ea5f..00000000000
--- a/paddle/legacy/gserver/layers/TransLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TransLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER(trans, TransLayer);
-
-bool TransLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for trans-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  return true;
-}
-
-void TransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  MatrixPtr input = getInputValue(0);
-  int height = input->getHeight();
-  int width = input->getWidth();
-
-  resizeOutput(width, height);
-
-  MatrixPtr outV = getOutputValue();
-
-  /* outV's memory has been allocated, so memAlloc = false */
-  input->transpose(outV, false);
-  if (getInputGrad(0)) {
-    zeroGrad();
-  }
-}
-
-void TransLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = getOutputGrad();
-  if (outputGrad == NULL) {
-    return;
-  }
-  MatrixPtr preGrad = getInputGrad(0);
-  if (preGrad) {
-    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
-                                         preGrad->getWidth(),
-                                         /* trans= */ false,
-                                         preGrad->useGpu());
-    outputGrad->transpose(transGrad, false);
-    preGrad->add(*transGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.h b/paddle/legacy/gserver/layers/TransLayer.h
deleted file mode 100644
index 0a6b13933f8..00000000000
--- a/paddle/legacy/gserver/layers/TransLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for transposing a minibatch matrix.
- * \f[
-     y = x^\mathrm{T}
- * \f]
- * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
- *
- * The config file api is trans_layer.
- */
-class TransLayer : public Layer {
- public:
-  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
deleted file mode 100644
index c8533dc7d78..00000000000
--- a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief TransposedFullMatrixProjection performs full matrix multiplication:
- * out.row[i] += in.row[i] * weight.transpose
- *
- * The config file api is trans_full_matrix_projection.
- */
-class TransposedFullMatrixProjection : public Projection {
- public:
-  TransposedFullMatrixProjection(const ProjectionConfig& config,
-                                 ParameterPtr parameter,
-                                 bool useGPu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection);
-
-TransposedFullMatrixProjection::TransposedFullMatrixProjection(
-    const ProjectionConfig& config, ParameterPtr parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(
-      new Weight(config.output_size(), config.input_size(), parameter));
-}
-
-void TransposedFullMatrixProjection::forward() {
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1);
-}
-
-void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(
-        *(out_->grad->getTranspose()), *(in_->value), 1, 1);
-  }
-
-  // If callback does not change value, backprop error asynchronously so that
-  // we can do the callback concurrently.
-  // This is still a little bit dangerous since theoretically for
-  // SyncMultiGpuMachine it is possible that the value copyback can still
-  // happen at the same time as the error backprop where the value is being
-  // used.
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1);
-  }
-
-  hl_set_sync_flag(syncFlag);
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.cpp b/paddle/legacy/gserver/layers/UpsampleLayer.cpp
deleted file mode 100644
index 3ff5332e640..00000000000
--- a/paddle/legacy/gserver/layers/UpsampleLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-    limitations under the License. */
-
-#include "UpsampleLayer.h"
-#include "iostream"
-
-namespace paddle {
-
-REGISTER_LAYER(upsample, UpsampleLayer);
-
-size_t UpsampleLayer::getOutputSize() {
-  if (upsampleSize_ == 0) {
-    upsampleSize_ = imgSize_ * scale_ - static_cast<int>(padOutX_);
-    upsampleSizeY_ = imgSizeY_ * scaleY_ - static_cast<int>(padOutY_);
-  }
-  return upsampleSize_ * upsampleSizeY_ * channels_;
-}
-
-bool UpsampleLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-  CHECK_EQ(config_.inputs_size(), 2);
-  const auto& conf = config_.inputs(0).upsample_conf();
-  const auto& img_conf = conf.image_conf();
-
-  imgSizeY_ =
-      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
-  imgSize_ = img_conf.img_size();
-  channels_ = img_conf.channels();
-
-  CHECK((conf.has_upsample_size()) || (conf.has_scale()))
-      << "scale or upsample_size is required.";
-
-  if (conf.has_upsample_size()) {
-    upsampleSize_ = conf.upsample_size();
-    upsampleSizeY_ = upsampleSize_;
-    if (conf.has_upsample_size_y()) {
-      upsampleSizeY_ = conf.upsample_size_y();
-    }
-  } else {
-    if (!conf.has_scale_y()) {
-      scale_ = scaleY_ = conf.scale_y();
-      CHECK_GT(static_cast<int>(scale_), 1);
-    } else {
-      scale_ = conf.scale();
-      scaleY_ = conf.scale_y();
-    }
-    padOutX_ = conf.pad_out_x();
-    padOutY_ = conf.pad_out_y();
-    CHECK(!padOutX_ || scale_ == 2)
-        << "Output height padding compensation requires scale_ == 2";
-    CHECK(!padOutY_ || scaleY_ == 2)
-        << "Output width padding compensation requires scaleY_ == 2";
-    upsampleSize_ = upsampleSizeY_ = 0;
-  }
-  return true;
-}
-
-void UpsampleLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr input = getInputValue(0);
-  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
-
-  size_t batchSize = input->getHeight();
-  size_t outSize = getOutputSize();
-
-  CHECK_EQ(input->getWidth(), mask->getWidth());
-  CHECK_EQ(mask->getHeight(), batchSize);
-  resetOutput(batchSize, outSize);
-
-  MatrixPtr output = getOutputValue();
-  output->upsampleForward(*input,
-                          *mask,
-                          imgSize_,
-                          imgSizeY_,
-                          channels_,
-                          upsampleSize_,
-                          upsampleSizeY_);
-}
-
-void UpsampleLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  inputGrad->upsampleBackward(*outputGrad,
-                              *mask,
-                              imgSize_,
-                              imgSizeY_,
-                              channels_,
-                              upsampleSize_,
-                              upsampleSizeY_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.h b/paddle/legacy/gserver/layers/UpsampleLayer.h
deleted file mode 100644
index 2fe5938244c..00000000000
--- a/paddle/legacy/gserver/layers/UpsampleLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer transpose the pooling process.
- * It takes two input, the first input is the input data, and
- * the second is the mask data from the max-pool-with-mask layer.
- *
- */
-
-class UpsampleLayer : public Layer {
- public:
-  explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
-  ~UpsampleLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  size_t getOutputSize();
-
- protected:
-  size_t scale_, scaleY_;
-  size_t upsampleSize_, upsampleSizeY_;
-  size_t padOutX_, padOutY_;
-  size_t imgSize_, imgSizeY_;
-  size_t channels_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.cpp b/paddle/legacy/gserver/layers/ValidationLayer.cpp
deleted file mode 100644
index 9956fd2ed41..00000000000
--- a/paddle/legacy/gserver/layers/ValidationLayer.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <fstream>
-#include <memory>
-
-#include "ValidationLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-bool ValidationLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  return Layer::init(layerMap, parameterMap);
-}
-
-void ValidationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  CHECK(output);
-  IVectorPtr label = getInputLabel(*getLabelLayer());
-  CHECK(label);
-  validationImp(output, label);
-}
-
-void ValidationLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-}
-
-bool AucValidation::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("last-column-auc");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  if (3 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[2]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return ret;
-}
-
-void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-
-  bool supportWeight = (3 == inputLayers_.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? getInputValue(*inputLayers_[2]) : nullptr;
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    size_t height = output->getHeight();
-    size_t width = output->getWidth();
-    Matrix::resizeOrCreate(cpuOutput_,
-                           height,
-                           width,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    float y1 = output->getData()[i * output->getWidth() + 1];
-    int* labels = label->getData();
-    predictArray_.push_back(PredictionResult(y1, labels[i]));
-  }
-  std::vector<Argument> arguments;
-  if (3 == inputLayers_.size()) {
-    arguments.resize(3);
-    arguments[2].value = weight;
-  } else {
-    arguments.resize(2);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  evaluator_->evalImp(arguments);
-}
-
-void AucValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << std::endl;
-    }
-  }
-
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-  predictArray_.clear();
-}
-
-bool PnpairValidation::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("pnpair");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  config.add_input_layers(inputLayers_[2]->getName());
-  if (4 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[3]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return true;
-}
-
-void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-  MatrixPtr weight =
-      (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr;
-  IVectorPtr info = getInputLabel(*getInfoLayer());
-  std::vector<Argument> arguments;
-  if (4 == inputLayers_.size()) {
-    arguments.resize(4);
-    arguments[3].value = weight;
-  } else {
-    arguments.resize(3);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  arguments[2].ids = info;
-  evaluator_->evalImp(arguments);
-}
-
-void PnpairValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    (dynamic_cast<PnpairEvaluator*>(evaluator_.get()))->printPredictResults();
-  }
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.h b/paddle/legacy/gserver/layers/ValidationLayer.h
deleted file mode 100644
index fbc94e8ef57..00000000000
--- a/paddle/legacy/gserver/layers/ValidationLayer.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "Layer.h"
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-class ValidationLayer : public Layer {
- public:
-  explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  LayerPtr getInfoLayer() {
-    assert(inputLayers_.size() > 2);
-    return inputLayers_[2];
-  }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
-
-  void onPassEnd() override = 0;
-};
-
-/*
- * AucValidation
- */
-class AucValidation : public ValidationLayer {
- public:
-  explicit AucValidation(const LayerConfig& config)
-      : ValidationLayer(config),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label) : out(__out), label(__label) {}
-    real out;
-    int label;
-  };
-  std::vector<PredictionResult> predictArray_;
-
- private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-};
-
-/*
- * positive-negative pair rate Validation
- */
-class PnpairValidation : public ValidationLayer {
- public:
-  explicit PnpairValidation(const LayerConfig& config)
-      : ValidationLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
- private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-};
-
-typedef std::shared_ptr<ValidationLayer> ValidationLayerPtr;
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/WarpCTCLayer.cpp b/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
deleted file mode 100644
index 6b1656a523d..00000000000
--- a/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "WarpCTCLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(warp_ctc, WarpCTCLayer);
-
-bool WarpCTCLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parament class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL);
-
-  /* The inputLayers_[0] must be sequence output without softmax */
-  numClasses_ = config_.size();
-  CHECK_GE(numClasses_, 2UL);
-  CHECK_EQ(numClasses_, inputLayers_[0]->getSize());
-
-  blank_ = config_.blank();
-  CHECK_LT(blank_, numClasses_);
-
-  normByTimes_ = config_.norm_by_times();
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-
-  return true;
-}
-
-void WarpCTCLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& output = getInput(0);
-  const Argument& labels = getInput(1);
-
-  CHECK(output.sequenceStartPositions);
-  CHECK(labels.sequenceStartPositions);
-  CHECK(labels.ids);
-
-  size_t numSequences = labels.sequenceStartPositions->getSize() - 1;
-  CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1);
-
-  resizeOutput(numSequences, 1);
-
-  const int* cpuLabelStartPositions =
-      labels.sequenceStartPositions->getData(false);
-  const int* cpuOutputStartPositions =
-      output.sequenceStartPositions->getData(false);
-
-  std::vector<int> cpuLabelLengths(numSequences);
-  std::vector<int> cpuOutputLengths(numSequences);
-  for (size_t i = 0; i < numSequences; i++) {
-    cpuLabelLengths[i] =
-        cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i];
-    cpuOutputLengths[i] =
-        cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i];
-  }
-
-  /* Get the maximum sequence length */
-  maxSequenceLength_ = 0;
-  maxSequenceLength_ = *std::max_element(
-      cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences);
-
-  Matrix::resizeOrCreate(batchValue_,
-                         /* height */ numSequences * maxSequenceLength_,
-                         /* width */ numClasses_,
-                         /* trans */ false,
-                         /* useGpu */ useGpu_);
-
-  Matrix::resizeOrCreate(batchGrad_,
-                         /* height */ numSequences * maxSequenceLength_,
-                         /* width */ numClasses_,
-                         /* trans */ false,
-                         /* useGpu */ useGpu_);
-  batchGrad_->zeroMem();
-
-  seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions);
-
-  /* labels always in CPU memory */
-  IVector::resizeOrCreate(cpuLabels_,
-                          /* size */ (labels.ids)->getSize(),
-                          /* useGpu */ false);
-  cpuLabels_->copyFrom(*(labels.ids));
-
-  /* labels always in CPU memory */
-  Matrix::resizeOrCreate(cpuCosts_,
-                         /* height */ numSequences,
-                         /* width */ 1,
-                         /* trans */ false,
-                         /* useGpu */ false);
-
-  /* Init warp-ctc options */
-  hl_warpctc_options_t options;
-  hl_warpctc_init(blank_, useGpu_, &options);
-
-  /* Get the needed workspace size */
-  size_t workspaceBytes = 0;
-  hl_warpctc_get_workspace_size(cpuLabelLengths.data(),
-                                cpuOutputLengths.data(),
-                                numClasses_,
-                                numSequences,
-                                &options,
-                                &workspaceBytes);
-  CHECK_GT(workspaceBytes, 0UL);
-
-  size_t workspaceLength = workspaceBytes / sizeof(real) + 1;
-  Vector::resizeOrCreate(workspace_,
-                         /* size */ workspaceLength,
-                         /* useGpu */ useGpu_);
-
-  hl_warpctc_compute_loss(batchValue_->getData(),
-                          batchGrad_->getData(),
-                          cpuLabels_->getData(),
-                          cpuLabelLengths.data(),
-                          cpuOutputLengths.data(),
-                          numClasses_,
-                          numSequences,
-                          cpuCosts_->getData(),
-                          workspace_->getData(),
-                          &options);
-
-  /* Copy the costs */
-  output_.value->copyFrom(*cpuCosts_);
-}
-
-void WarpCTCLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  const Argument& output = getInput(0);
-  CHECK(batchGrad_);
-
-  batch2seqPadding(
-      output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_);
-}
-
-void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue,
-                                    MatrixPtr& batchValue,
-                                    const ICpuGpuVectorPtr& seqStartPositions) {
-  size_t numSequences = seqStartPositions->getSize() - 1;
-  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
-
-  real* seqData = seqValue->getData();
-  real* batchData = batchValue->getData();
-  if (useGpu_) {
-    hl_sequence2batch_copy_padding(batchData,
-                                   seqData,
-                                   seqStartPositionsData,
-                                   numClasses_,
-                                   maxSequenceLength_,
-                                   numSequences,
-                                   false,
-                                   true);
-  } else {
-    for (size_t i = 0; i < maxSequenceLength_; i++) {
-      for (size_t j = 0; j < numSequences; j++) {
-        size_t sequenceStart = seqStartPositionsData[j];
-        size_t sequenceLength =
-            seqStartPositionsData[j + 1] - seqStartPositionsData[j];
-        if (i < sequenceLength) {
-          memcpy(batchData + (i * numSequences + j) * numClasses_,
-                 seqData + (sequenceStart + i) * numClasses_,
-                 numClasses_ * sizeof(real));
-        } else {
-          memset(batchData + (i * numSequences + j) * numClasses_,
-                 0,
-                 numClasses_ * sizeof(real));
-        }
-      }
-    }
-  }
-}
-
-void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue,
-                                    MatrixPtr& batchValue,
-                                    const ICpuGpuVectorPtr& seqStartPositions,
-                                    bool normByTimes) {
-  size_t numSequences = seqStartPositions->getSize() - 1;
-  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
-
-  real* seqData = seqValue->getData();
-  real* batchData = batchValue->getData();
-  if (useGpu_) {
-    hl_sequence2batch_copy_padding(batchData,
-                                   seqData,
-                                   seqStartPositionsData,
-                                   numClasses_,
-                                   maxSequenceLength_,
-                                   numSequences,
-                                   normByTimes,
-                                   false);
-  } else {
-    for (size_t i = 0; i < numSequences; i++) {
-      int sequenceStart = seqStartPositionsData[i];
-      int sequenceLength =
-          seqStartPositionsData[i + 1] - seqStartPositionsData[i];
-      real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
-      for (int j = 0; j < sequenceLength; j++) {
-        for (size_t k = 0; k < numClasses_; k++) {
-          seqData[(sequenceStart + j) * numClasses_ + k] =
-              batchData[(j * numSequences + i) * numClasses_ + k] * scale;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/WarpCTCLayer.h b/paddle/legacy/gserver/layers/WarpCTCLayer.h
deleted file mode 100644
index 3017ca794ec..00000000000
--- a/paddle/legacy/gserver/layers/WarpCTCLayer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer integrating the open-source warp-ctc library
- *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
- *        temporal classification cost.
- *
- * The config file api is warp_ctc_layer.
- */
-class WarpCTCLayer : public Layer {
- public:
-  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
-  ~WarpCTCLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  /**
-   * sequence matrix and batch matrix copy:
-   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
-   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
-   */
-  void seq2batchPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions);
-  void batch2seqPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions,
-                        bool normByTimes);
-
- protected:
-  size_t numClasses_;
-  size_t blank_;
-  size_t maxSequenceLength_;
-  bool normByTimes_;
-
-  MatrixPtr batchValue_;
-  MatrixPtr batchGrad_;
-  VectorPtr workspace_;
-
-  IVectorPtr cpuLabels_;
-  MatrixPtr cpuCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/tests/.gitignore b/paddle/legacy/gserver/tests/.gitignore
deleted file mode 100644
index 7f1845d7ec4..00000000000
--- a/paddle/legacy/gserver/tests/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-pyDataProviderBase.py
diff --git a/paddle/legacy/gserver/tests/CMakeLists.txt b/paddle/legacy/gserver/tests/CMakeLists.txt
deleted file mode 100644
index 93ddf5aa233..00000000000
--- a/paddle/legacy/gserver/tests/CMakeLists.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-# gserver pacakge unittests
-add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_RecurrentLayer)
-
-if(NOT MOBILE_INFERENCE)
-  add_simple_unittest(test_MultinomialSampler)
-endif()
-
-function(gserver_test TARGET)
-  add_unittest_without_exec(${TARGET}
-      ${TARGET}.cpp
-      LayerGradUtil.cpp)
-  add_test(NAME ${TARGET}
-      COMMAND ${TARGET})
-endfunction()
-
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
-
-gserver_test(test_LayerGrad)
-gserver_test(test_CRFLayerGrad)
-gserver_test(test_CrossEntropyOverBeamGrad)
-gserver_test(test_SeqSliceLayerGrad)
-gserver_test(test_ActivationGrad)
-gserver_test(test_ConvTrans)
-gserver_test(test_PriorBox)
-gserver_test(test_DetectionOutput)
-gserver_test(test_ConvUnify)
-gserver_test(test_BatchNorm)
-gserver_test(test_KmaxSeqScore)
-gserver_test(test_Expand)
-gserver_test(test_MaxPoolingWithMaskOutput)
-gserver_test(test_Upsample)
-
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/gserver/tests)
-function(gserver_test_with_python TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endfunction()
-
-gserver_test_with_python(test_PyDataProvider2)
-if(WITH_PYTHON)
-    gserver_test_with_python(test_PyDataProvider)
-endif()
-if(NOT MOBILE_INFERENCE)
-    gserver_test_with_python(test_CompareTwoNets)
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
-    gserver_test_with_python(test_RecurrentGradientMachine)
-endif()
-
-########## test_MKLDNN layers and activations ##########
-if(WITH_MKLDNN)
-    add_unittest_without_exec(test_MKLDNN
-        test_MKLDNN.cpp
-        MKLDNNTester.cpp
-        LayerGradUtil.cpp)
-    add_test(NAME test_MKLDNN
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-endif()
-
-############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
-    add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp)
-    add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-    ################## test_Evaluator #############
-    add_unittest(test_Evaluator
-        test_Evaluator.cpp)
-      
-    ########### test_NetworkCompare ###############
-    add_unittest_without_exec(test_NetworkCompare
-        test_NetworkCompare.cpp)
-    if(WITH_GPU)
-        set(use_gpu true)
-    else()
-        set(use_gpu false)
-    endif()
-    add_test(NAME test_NetworkCompare
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-
-    ############ test_CompareSparse ################
-    add_unittest_without_exec(test_CompareSparse
-        test_CompareSparse.cpp)
-    if(NOT ON_TRAVIS)
-      add_test(NAME test_CompareSparse
-        COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
-                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-    endif()
-endif()
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.cpp b/paddle/legacy/gserver/tests/LayerGradUtil.cpp
deleted file mode 100644
index f08c1cd1d50..00000000000
--- a/paddle/legacy/gserver/tests/LayerGradUtil.cpp
+++ /dev/null
@@ -1,854 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LayerGradUtil.h"
-
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-namespace paddle {
-real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
-  testLayer->forward(PASS_GC);
-  std::vector<Argument> outArgs;
-  outArgs.push_back(testLayer->getOutput());
-  if (weights) {
-    outArgs[0].value->dotMul(*outArgs[0].value, *weights);
-  }
-  return Argument::sum(outArgs);
-}
-
-real getDiffAndPrint(real newCost1,
-                     real newCost2,
-                     real callbackCount,
-                     char fill,
-                     string testLayerName,
-                     string name,
-                     real step,
-                     real delta) {
-  EXPECT_FALSE(std::isnan(newCost1));
-  EXPECT_FALSE(std::isnan(newCost2));
-
-  real trueDelta = (newCost1 - newCost2) * (callbackCount / 2.);
-  real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
-  LOG(INFO) << setiosflags(ios::left) << setfill(fill) << setw(20)
-            << testLayerName << " " << setw(20) << name << "step=" << setw(15)
-            << step << "cost1=" << setw(10) << newCost1 << "cost2=" << setw(10)
-            << newCost2 << "true_delta=" << setw(15) << trueDelta
-            << "analytic_delta=" << setw(15) << delta << "diff=" << diff
-            << (abs(diff) > 0.01 ? " ***" : "");
-  if (fabs(diff - 1) < 0.02) {
-    LOG(INFO) << "The previous diff might be caused by not accumulating"
-              << " parameter gradients in backward()";
-  }
-  return diff;
-}
-
-void testState(LayerPtr testLayer,
-               vector<DataLayerPtr>& dataLayers,
-               vector<Argument>& datas) {
-  auto batchSize = datas[0].getBatchSize();
-  Argument data;
-  ICpuGpuVectorPtr sequenceStartPositions =
-      ICpuGpuVector::create(2, /* useGpu= */ false);
-  sequenceStartPositions->getMutableData(false)[0] = 0;
-  sequenceStartPositions->getMutableData(false)[1] = batchSize;
-  data.sequenceStartPositions = sequenceStartPositions;
-  testLayer->resetState();
-  for (size_t j = 0; j < datas.size(); ++j) {
-    if (datas[j].value) {
-      data.value = datas[j].value;
-    }
-    if (datas[j].ids) {
-      data.ids = datas[j].ids;
-    }
-    dataLayers[j]->setData(data);
-    dataLayers[j]->forward(PASS_TEST);
-  }
-  testLayer->forward(PASS_TEST);
-  Argument batchOut;
-  batchOut.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-
-  sequenceStartPositions->getMutableData(false)[1] = 1;
-  testLayer->resetState();
-
-  auto testLayerState = [&](int batchId) {
-    for (size_t j = 0; j < datas.size(); ++j) {
-      if (datas[j].value) {
-        data.value = datas[j].value->subMatrix(batchId, 1);
-      }
-      if (datas[j].ids) {
-        data.ids = IVector::create(
-            datas[j].ids->getData() + batchId, 1, FLAGS_use_gpu);
-      }
-      dataLayers[j]->setData(data);
-      dataLayers[j]->forward(PASS_TEST);
-    }
-
-    testLayer->forward(PASS_TEST);
-    Argument out;
-    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    if (batchOut.value) {
-      size_t dim = batchOut.value->getWidth();
-      ASSERT_TRUE((bool)out.value);
-      EXPECT_EQ(dim, out.value->getWidth());
-      EXPECT_EQ(1UL, out.value->getHeight());
-      auto ret = std::mismatch(batchOut.value->getData() + batchId * dim,
-                               batchOut.value->getData() + (batchId + 1) * dim,
-                               out.value->getData());
-      if (ret.second != out.value->getData() + dim) {
-        // If reaches here, the test will fail
-        EXPECT_EQ(*ret.first, *ret.second);
-      }
-    } else if (batchOut.ids) {
-      ASSERT_TRUE((bool)out.ids);
-      EXPECT_EQ(1UL, out.ids->getSize());
-      EXPECT_EQ(batchOut.ids->getElement(batchId), out.ids->getElement(0));
-    }
-  };
-
-  CHECK_GT(batchSize, 0);
-  std::vector<LayerStatePtr> statePtrs;
-  statePtrs.reserve(batchSize);
-
-  // Test layer setState() and getState()
-  for (int i = 0; i < batchSize; ++i) {
-    statePtrs.push_back(testLayer->getState());
-    testLayerState(i);
-  }
-  for (int k = 0; k < batchSize - 1; ++k) {
-    testLayer->setState(statePtrs[k]);
-    for (int i = k; i < batchSize; ++i) {
-      testLayerState(i);
-    }
-  }
-}
-
-void testBatchState(LayerPtr testLayer,
-                    vector<DataLayerPtr>& dataLayers,
-                    vector<Argument>& datas) {
-  auto batchSize = datas[0].getBatchSize();
-  Argument data;
-  /*two sequences*/
-  size_t numSequences = 2;
-  ICpuGpuVectorPtr sequenceStartPositions =
-      ICpuGpuVector::create(numSequences + 1, /* useGpu= */ false);
-  int* cpuStarts = sequenceStartPositions->getMutableData(false);
-  int len = ::rand() % (batchSize - 1);
-  cpuStarts[0] = 0;
-  cpuStarts[1] = len > 0 ? len : 1;
-  cpuStarts[2] = batchSize;
-
-  data.sequenceStartPositions = sequenceStartPositions;
-  for (size_t j = 0; j < datas.size(); ++j) {
-    if (datas[j].value) {
-      data.value = datas[j].value;
-    }
-    if (datas[j].ids) {
-      data.ids = datas[j].ids;
-    }
-    dataLayers[j]->setData(data);
-    dataLayers[j]->forward(PASS_TEST);
-  }
-  testLayer->resetState();
-  testLayer->forward(PASS_TEST);
-  Argument batchOut;
-  batchOut.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-
-  /*split one miniBatch into two miniBatchs*/
-  std::vector<int> seqSplitPos;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    int len = ::rand() % (cpuStarts[seqId + 1] - cpuStarts[seqId]);
-    len = len > 0 ? len : 1;
-    seqSplitPos.push_back(cpuStarts[seqId] + len);
-  }
-
-  std::vector<int> start; /*seq start pos in source data*/
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    start.push_back(cpuStarts[seqId]);
-  }
-  testLayer->resetState();
-  Argument splitData;
-  for (size_t batchId = 0; batchId < 2; ++batchId) {
-    size_t splitBatchSize = 0;
-    std::vector<int> seqLens;
-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      int seqLen = (batchId == 0) ? seqSplitPos[seqId] - cpuStarts[seqId]
-                                  : cpuStarts[seqId + 1] - seqSplitPos[seqId];
-      seqLens.push_back(seqLen);
-      splitBatchSize += seqLen;
-    }
-    ICpuGpuVectorPtr cpuSeqStartPos =
-        ICpuGpuVector::create(3, /* useGpu= */ false);
-    int* seqStartPosData = cpuSeqStartPos->getMutableData(false);
-    seqStartPosData[0] = 0;
-    seqStartPosData[1] = seqLens[0];
-    seqStartPosData[2] = splitBatchSize;
-
-    CHECK_GT(splitBatchSize, size_t(0));
-    splitData.sequenceStartPositions = cpuSeqStartPos;
-    for (size_t j = 0; j < datas.size(); ++j) {
-      if (datas[j].value) {
-        Matrix::resizeOrCreate(splitData.value,
-                               splitBatchSize,
-                               datas[j].value->getWidth(),
-                               false,
-                               FLAGS_use_gpu);
-        for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-          if (seqLens[seqId]) {
-            splitData.value->subMatrix(seqStartPosData[seqId], seqLens[seqId])
-                ->copyFrom(
-                    *datas[j].value->subMatrix(start[seqId], seqLens[seqId]));
-          }
-        }
-      }
-      if (datas[j].ids) {
-        IVector::resizeOrCreate(splitData.ids, splitBatchSize, FLAGS_use_gpu);
-        for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-          if (seqLens[seqId]) {
-            splitData.ids->subVec(seqStartPosData[seqId], seqLens[seqId])
-                ->copyFrom(*datas[j].ids->subVec(start[seqId], seqLens[seqId]));
-          }
-        }
-      }
-      dataLayers[j]->setData(splitData);
-      dataLayers[j]->forward(PASS_TEST);
-    }
-
-    testLayer->forward(PASS_TEST);
-    Argument out;
-    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    if (batchOut.value) {
-      size_t dim = batchOut.value->getWidth();
-      ASSERT_TRUE((bool)out.value);
-      EXPECT_EQ(dim, out.value->getWidth());
-      for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-        if (seqLens[seqId]) {
-          out.value->subMatrix(seqStartPosData[seqId], seqLens[seqId])
-              ->sub(*batchOut.value->subMatrix(start[seqId], seqLens[seqId]));
-        }
-      }
-    }
-
-    std::vector<Argument> args;
-    args.push_back(out);
-    ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      start[seqId] += seqLens[seqId];
-    }
-  }
-}
-
-double genPerturbation(const real* oldGrad, real* newGrad, size_t dim) {
-  double gradNorm = 0, dNorm = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    newGrad[i] = 2. * rand() / RAND_MAX - 1;  // NOLINT
-    dNorm += newGrad[i] * newGrad[i];
-    gradNorm += oldGrad[i] * oldGrad[i];
-  }
-  if (gradNorm > 0) {
-    real s = 0.5 * sqrt(gradNorm / dNorm);
-    for (size_t i = 0; i < dim; ++i) {
-      newGrad[i] = s * newGrad[i] + oldGrad[i];
-    }
-  }
-  double delta = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    delta += oldGrad[i] * newGrad[i];
-  }
-  return delta;
-}
-
-void initWeight(MatrixPtr& weights) {
-  MatrixPtr tmpMat = weights->clone();
-  for (int i = 0; i < int(tmpMat->getElementCnt()); i++) {
-    tmpMat->getData()[i] = (11 - 2 * (i % 11));
-  }
-  weights->copyFrom(*tmpMat);
-}
-
-void initBatchState(LayerPtr dataLayer,
-                    LayerPtr testLayer,
-                    LayerStatePtr state,
-                    bool useGpu) {
-  int sequenceNum = dataLayer->getOutput().getNumSequences();
-  MatrixPtr prevBatchOutput =
-      Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu);
-  MatrixPtr prevBatchState =
-      Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu);
-  prevBatchOutput->randomizeUniform();
-  prevBatchState->randomizeUniform();
-  state->value.clear();
-  state->value.push_back(prevBatchOutput);
-  state->value.push_back(prevBatchState);
-}
-
-void initDataLayer(TestConfig testConf,
-                   std::vector<DataLayerPtr>* dataLayers,
-                   vector<Argument>* datas,
-                   LayerMap* layerMap,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu) {
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  IVectorPtr cpuSequenceDims;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    if (testConf.inputDefs[i].inputType != INPUT_SEQUENCE_LABEL) continue;
-
-    const std::vector<int>& labelSeqStartPositions =
-        testConf.inputDefs[i].labelSeqStartPositions;
-    if (labelSeqStartPositions.size() != 0) {
-      CHECK(!sequenceStartPositions);
-      CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
-
-      sequenceStartPositions =
-          ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
-      sequenceStartPositions->copyFrom(
-          labelSeqStartPositions.data(), labelSeqStartPositions.size(), useGpu);
-    }
-  }
-
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    LayerConfig config;
-    config.set_name(testConf.inputDefs[i].name);
-    config.set_type("data");
-    config.set_size(testConf.inputDefs[i].dim);
-    LayerPtr layer = LayerPtr(new DataLayer(config));
-    size_t numSequence = sequenceStartPositions
-                             ? sequenceStartPositions->getSize() - 1
-                             : batchSize / 10 + 1;
-
-    Argument data;
-    auto fillData = [&](bool trans, int height, int width) {
-      int newHeight = trans ? height : width;
-      int newWidth = trans ? width : height;
-      data.value = Matrix::create(newHeight, newWidth, false, useGpu);
-      data.grad = Matrix::create(newHeight, newWidth, false, useGpu);
-    };
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_HASSUB_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-      case INPUT_SEQUENCE_MDIM_DATA:
-        fillData(trans, layer->getSize(), batchSize);
-        data.value->randomizeUniform();
-        // make sure that multi-class-cross-entry won't encounter negatives
-        // make sure that multi_binary_label satisfies 0~1
-        data.value->add(-0.5);
-        if (testLayerName != "prelu") {
-          data.value->sigmoid(*data.value);
-        }
-        data.grad->zeroMem();
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        if (testConf.inputDefs[i].labelInitValue.size() != 0) {
-          const std::vector<int>& labelInitValue =
-              testConf.inputDefs[i].labelInitValue;
-          CHECK_EQ(labelInitValue.size(), batchSize);
-          data.ids = VectorT<int>::create(batchSize, useGpu);
-          data.ids->copyFrom(labelInitValue.data(), batchSize);
-        } else {
-          data.ids = VectorT<int>::create(batchSize, useGpu);
-          // now rand number can be 0 to inputDefs[i].dim
-          data.ids->rand(testConf.inputDefs[i].dim);
-        }
-        break;
-      case INPUT_SPARSE_NON_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(
-            batchSize,
-            layer->getSize(),
-            /* withValue= */ false,
-            useGpu,
-            testConf.inputDefs[i].sparse.equalNnzPerSample);
-        break;
-      case INPUT_SPARSE_FLOAT_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(batchSize,
-                                            layer->getSize(),
-                                            /* withValue= */ true,
-                                            useGpu);
-        break;
-      case INPUT_DENSE_DIM_DATA:
-        fillData(trans, layer->getSize(), numSequence);
-        data.value->randomizeUniform();
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        data.grad->zeroMem();
-        break;
-      case INPUT_SELF_DEFINE_DATA: {
-        if (testConf.inputDefs[i].ids.size()) {
-          data.ids = IVector::create(testConf.inputDefs[i].ids.size(), useGpu);
-          data.ids->copyFrom(testConf.inputDefs[i].ids.data(),
-                             testConf.inputDefs[i].ids.size());
-        } else if (testConf.inputDefs[i].selfDefinedData) {
-          size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
-          size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
-          CHECK_GT(static_cast<int>(height), 0);
-          CHECK_GT(static_cast<int>(width), 0);
-          data.value = Matrix::create(height, width, false, useGpu);
-          data.grad = Matrix::create(height, width, false, useGpu);
-          data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
-          data.grad->zeroMem();
-        } else {
-          LOG(FATAL) << "No self-defined data are given.";
-          return;
-        }
-
-        const std::vector<int>& labelSeqStartPositions =
-            testConf.inputDefs[i].labelSeqStartPositions;
-        if (labelSeqStartPositions.size() != 0) {
-          CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
-
-          sequenceStartPositions =
-              ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
-          sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
-                                           labelSeqStartPositions.size(),
-                                           useGpu);
-          data.sequenceStartPositions = sequenceStartPositions;
-        }
-
-        const std::vector<int>& labelSubSeqStartPositions =
-            testConf.inputDefs[i].labelSubSeqStartPositions;
-        if (labelSubSeqStartPositions.size() != 0) {
-          CHECK_GE(static_cast<int>(labelSubSeqStartPositions.size()), 2);
-
-          subSequenceStartPositions =
-              ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu);
-          subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(),
-                                              labelSubSeqStartPositions.size(),
-                                              useGpu);
-          data.subSequenceStartPositions = subSequenceStartPositions;
-        }
-        break;
-      }
-      default:
-        LOG(FATAL) << " unknown inputType ";
-        return;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_HASSUB_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_MDIM_DATA) {
-      if (!sequenceStartPositions) {
-        generateSequenceStartPositions(batchSize, sequenceStartPositions);
-      }
-      data.sequenceStartPositions = sequenceStartPositions;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_HASSUB_SEQUENCE_DATA) {
-      if (!subSequenceStartPositions) {
-        generateSubSequenceStartPositions(sequenceStartPositions,
-                                          subSequenceStartPositions);
-      }
-      data.subSequenceStartPositions = subSequenceStartPositions;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_MDIM_DATA) {
-      if (!cpuSequenceDims) {
-        generateMDimSequenceData(sequenceStartPositions, cpuSequenceDims);
-      }
-      data.cpuSequenceDims = cpuSequenceDims;
-    }
-
-    DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-    dataLayer->setData(data);
-    dataLayer->forward(PASS_GC);
-    dataLayers->push_back(dataLayer);
-    (*layerMap)[config.name()] = layer;
-    datas->push_back(data);
-  }
-}
-
-void initTestLayer(TestConfig testConf,
-                   LayerMap* layerMap,
-                   std::vector<ParameterPtr>* parameters,
-                   LayerPtr* testLayer) {
-  ParameterMap parameterMap;
-  size_t index = 0;
-  LayerConfig testConfig = testConf.layerConfig;
-  CHECK_EQ(testConf.inputDefs.size(),
-           size_t(testConf.layerConfig.inputs_size()));
-
-  auto initParameter = [&](string paraName,
-                           size_t paraSize,
-                           bool isStatic,
-                           bool initialize,
-                           ParameterConfig paraConfig) {
-    paraConfig.set_name(paraName);
-    paraConfig.set_size(paraSize);
-    paraConfig.set_is_static(isStatic);
-    auto para =
-        std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
-    para->enableType(PARAMETER_VALUE);
-    if (!para->isStatic()) {
-      para->enableType(PARAMETER_GRADIENT);
-      para->enableType(PARAMETER_MOMENTUM);
-    }
-    para->randomize();
-    para->setID(index++);
-    parameters->push_back(para);
-    parameterMap[paraConfig.name()] = para;
-  };
-
-  for (size_t i = 0; i < testConf.inputDefs.size(); i++) {
-    InputDef inputDef = testConf.inputDefs[i];
-    size_t paraSize = inputDef.paraSize;
-    bool sparse = inputDef.sparse.sparse;
-    LayerInputConfig& input = *(testConfig.mutable_inputs(i));
-    input.set_input_layer_name(inputDef.name);
-
-    if (paraSize) {
-      constexpr int kParaNameLen = 20;
-      char paraName[kParaNameLen];
-      snprintf(paraName, kParaNameLen, "para_%d", (int)i);
-      input.set_input_parameter_name(paraName);
-      ParameterConfig paraConfig;
-      paraConfig.set_is_sparse(sparse);
-      paraConfig.set_format(inputDef.sparse.format);
-      if (sparse) {
-        paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
-        paraConfig.add_dims(testConf.layerConfig.size());
-      }
-      CHECK_GE(testConf.paramInitialStd, 0);
-      paraConfig.set_initial_mean(testConf.paramInitialMean);
-      paraConfig.set_initial_std(testConf.paramInitialStd);
-      initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
-    }
-  }
-  if (testConf.biasSize) {
-    testConfig.set_bias_parameter_name("bias");
-    ParameterConfig paraConfig;
-    initParameter(testConfig.bias_parameter_name(),
-                  testConf.biasSize,
-                  testConf.staticBias,
-                  true,
-                  paraConfig);
-  }
-
-  *testLayer = Layer::create(testConfig);
-  (*layerMap)[testConfig.name()] = *testLayer;
-  (*testLayer)->init((*layerMap), parameterMap);
-  (*testLayer)->setNeedGradient(true);
-}
-
-void testPerturbParameter(TestConfig testConf,
-                          const MatrixPtr weights,
-                          const LayerStatePtr state,
-                          real cost,
-                          real callbackCount,
-                          real* maxDiff,
-                          LayerPtr testLayer,
-                          std::vector<ParameterPtr>* parameters) {
-  char fill = ' ';
-  for (auto& parameter : *parameters) {
-    if (parameter->isStatic()) {
-      continue;
-    }
-
-    size_t dim = parameter->getSize();
-    CpuVector oldPara(dim);
-    CpuVector newPara(dim);
-    VectorPtr v = parameter->getBuf(PARAMETER_VALUE);
-    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
-    vector<real> d(dim);
-
-    double delta = genPerturbation(cpuGrad.getData(), &d[0], dim);
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    if (fabs(step) < 1e-6) step = 1e-6;
-    delta *= step;
-
-    // compute newCost
-    real newCost[2];
-    for (int k = 0; k < 2; k++) {
-      for (size_t i = 0; i < dim; ++i) {
-        newp[i] = (k == 0) ? oldp[i] + step * d[i] : oldp[i] - step * d[i];
-      }
-      if (testConf.testBatchState) {
-        testLayer->setState(state);
-      }
-      parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-      parameter->setValueUpdated();
-      newCost[k] = getCostSum(testLayer, weights);
-    }
-    real diff = getDiffAndPrint(newCost[0],
-                                newCost[1],
-                                callbackCount,
-                                fill,
-                                testLayer->getName(),
-                                parameter->getName(),
-                                step,
-                                delta);
-    *maxDiff = std::max(*maxDiff, abs(diff));
-    // restore parameter
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
-    parameter->setValueUpdated();
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-}
-
-void testPerturbInput(TestConfig testConf,
-                      const MatrixPtr weights,
-                      const LayerStatePtr state,
-                      real cost,
-                      real callbackCount,
-                      real* maxDiff,
-                      LayerPtr testLayer,
-                      std::vector<DataLayerPtr> dataLayers) {
-  char fill = ' ';
-  for (size_t index = 0; index < testConf.inputDefs.size(); index++) {
-    InputType inputType = testConf.inputDefs[index].inputType;
-    if (inputType != INPUT_DATA && inputType != INPUT_SEQUENCE_DATA &&
-        inputType != INPUT_HASSUB_SEQUENCE_DATA) {
-      continue;
-    }
-
-    MatrixPtr outV = dataLayers[index]->getOutputValue();
-    int height = outV->getHeight();
-    int width = outV->getWidth();
-    size_t dim = height * width;
-
-    CpuMatrix oldPara(height, width);
-    CpuMatrix newPara(height, width);
-    oldPara.copyFrom(*outV);
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuMatrix cpuGrad(height, width);
-    cpuGrad.copyFrom(*(dataLayers[index]->getOutputGrad()));
-    CpuMatrix d(height, width);
-    real* data = d.getData();
-
-    double delta = genPerturbation(cpuGrad.getData(), data, dim);
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    if (fabs(step) < 1e-6) step = 1e-6;
-    delta *= step;
-
-    real newCost[2];
-    for (int k = 0; k < 2; k++) {
-      for (size_t i = 0; i < dim; ++i) {
-        newp[i] =
-            (k == 0) ? oldp[i] + step * data[i] : oldp[i] - step * data[i];
-      }
-      if (testConf.testBatchState) {
-        testLayer->setState(state);
-      }
-      outV->copyFrom(newPara);
-      newCost[k] = getCostSum(testLayer, weights);
-    }
-
-    real diff = getDiffAndPrint(newCost[0],
-                                newCost[1],
-                                callbackCount,
-                                fill,
-                                testLayer->getName(),
-                                dataLayers[index]->getName(),
-                                step,
-                                delta);
-    *maxDiff = std::max(*maxDiff, abs(diff));
-    // restore parameter
-    outV->copyFrom(oldPara);
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-}
-
-void testLayerGradKernel(TestConfig testConf,
-                         string testLayerName,
-                         size_t batchSize,
-                         bool trans,
-                         bool useGpu,
-                         bool useWeight,
-                         float epsilon) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  FLAGS_prev_batch_state = testConf.testBatchState;
-  MatrixPtr weights = nullptr;
-  testConf.layerConfig.set_name(testLayerName);
-  LOG(INFO) << " layer_type=" << testConf.layerConfig.type()
-            << " useGpu=" << useGpu;
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(testConf,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                testLayerName,
-                batchSize,
-                trans,
-                useGpu);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr testLayer;
-  initTestLayer(testConf, &layerMap, &parameters, &testLayer);
-
-  LayerStatePtr state = std::make_shared<LayerState>();
-  if (testConf.testBatchState) {
-    initBatchState(dataLayers[0], testLayer, state, useGpu);
-    testLayer->resetState();
-    testLayer->setState(state);
-  }
-
-  testLayer->forward(PASS_GC);
-  if (useWeight && weights == nullptr) {
-    weights = testLayer->getOutput().value->clone(0, 0, useGpu);
-    initWeight(weights);
-  }
-  std::vector<Argument> outArgs;
-  outArgs.push_back(testLayer->getOutput());
-  if (useWeight) {
-    outArgs[0].value = outArgs[0].value->clone(0, 0, useGpu);
-    outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights);
-  }
-
-  real cost = Argument::sum(outArgs);
-  LOG(INFO) << " cost " << cost;
-  EXPECT_FALSE(std::isnan(cost));
-
-  // Test whether the callback is called for a parameter
-  if (testLayer->getOutputGrad()) {
-    useWeight ? testLayer->getOutput().grad->copyFrom(*weights)
-              : testLayer->getOutputGrad()->resetOne();
-  }
-  vector<int> callbackFlags(parameters.size(), 0);
-  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
-  testLayer->backward(callback);
-
-  // do forward and backward for another time to test that gradient is doubled
-  int callbackCount = 1;
-  if (testConf.testAccumulate) {
-    if (testConf.testBatchState) {
-      testLayer->setState(state);
-    }
-    testLayer->forward(PASS_GC);
-    if (testLayer->getOutputGrad()) {
-      useWeight ? testLayer->getOutput().grad->copyFrom(*weights)
-                : testLayer->getOutputGrad()->resetOne();
-    }
-    testLayer->backward(callback);
-    ++callbackCount;
-  }
-  for (size_t i = 0; i < parameters.size(); ++i) {
-    EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, callbackFlags[i]);
-  }
-
-  // Test whether the layer's forward calculation is stable
-  // by adding perturbation to its parameters or its input layers
-  real maxDiff = 0;
-  testPerturbParameter(testConf,
-                       weights,
-                       state,
-                       cost,
-                       callbackCount,
-                       &maxDiff,
-                       testLayer,
-                       &parameters);
-  testPerturbInput(testConf,
-                   weights,
-                   state,
-                   cost,
-                   callbackCount,
-                   &maxDiff,
-                   testLayer,
-                   dataLayers);
-  EXPECT_LE(fabs(maxDiff), epsilon);
-
-  if (testConf.testState) {
-    testState(testLayer, dataLayers, datas);
-  }
-  if (testConf.testBatchState) {
-    testBatchState(testLayer, dataLayers, datas);
-  }
-}
-
-void testLayerGrad(TestConfig testConf,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu,
-                   bool useWeight,
-                   float epsilon) {
-  testLayerGradKernel(
-      testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon);
-  bool isStaticTest = false;
-  LayerConfig testConfig = testConf.layerConfig;
-  for (size_t i = 0; i < testConf.inputDefs.size(); i++) {
-    InputDef inputDef = testConf.inputDefs[i];
-    // Some layer must set isStatic true, like DataNormLayer
-    // so use !isStatic in if
-    if (inputDef.paraSize && (!inputDef.isStatic)) {
-      testConf.inputDefs[i].isStatic = true;
-      isStaticTest = true;
-    }
-  }
-
-  if (testConf.biasSize) {
-    testConf.staticBias = true;
-    isStaticTest = true;
-  }
-  if (isStaticTest) {
-    testLayerGradKernel(
-        testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon);
-  }
-}
-
-void testProjectionGrad(ProjectionConfig conf,
-                        InputType inputType,
-                        size_t parameterSize,
-                        size_t batchSize,
-                        bool useGpu,
-                        bool testState,
-                        int biasSize,
-                        bool sharedBias) {
-  TestConfig config;
-  conf.set_name(conf.type());
-  config.layerConfig.set_type("mixed");
-  config.layerConfig.set_size(conf.output_size());
-  config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
-  config.layerConfig.set_bias_size(config.biasSize);
-  config.layerConfig.set_shared_biases(sharedBias);
-  config.inputDefs.push_back({inputType,
-                              "layer_0",
-                              static_cast<size_t>(conf.input_size()),
-                              parameterSize});
-  *config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
-  config.testState = testState;
-  testLayerGrad(config, "mixed", batchSize, false, useGpu);
-}
-
-void testOperatorGrad(TestConfig& config,
-                      OperatorConfig& operatorConf,
-                      size_t batchSize,
-                      bool useGpu,
-                      bool testState) {
-  config.layerConfig.set_type("mixed");
-
-  operatorConf.set_output_size(config.layerConfig.size());
-  for (size_t i = 0; i < config.inputDefs.size(); ++i) {
-    operatorConf.add_input_indices(i);
-    operatorConf.add_input_sizes(config.inputDefs[i].dim);
-  }
-
-  config.testState = testState;
-  testLayerGrad(config, "mixed", batchSize, false, useGpu);
-}
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.h b/paddle/legacy/gserver/tests/LayerGradUtil.h
deleted file mode 100644
index 941989a1da4..00000000000
--- a/paddle/legacy/gserver/tests/LayerGradUtil.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-using namespace std;  // NOLINT
-
-namespace paddle {
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_HASSUB_SEQUENCE_DATA,  // sequence has sub-sequence
-  INPUT_SEQUENCE_MDIM_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA,
-  INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
-  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
-};
-
-struct ParaSparse {
-  bool sparse;
-  string format;
-  // if equalNnzPerSample is set true,
-  // every row of the sparse matrix in a format of CSR has a same
-  // number of nnz values. Currently, this flag is only used for
-  // selective_fc layer
-  bool equalNnzPerSample;
-  ParaSparse(const string& formatIn = "") {  // NOLINT
-    if (formatIn == "") {
-      sparse = false;
-    } else {
-      sparse = true;
-    }
-    equalNnzPerSample = false;
-  }
-  ParaSparse(const string& formatIn, bool equalNnz) {
-    format = formatIn;
-    sparse = true;
-    equalNnzPerSample = equalNnz;
-  }
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-  size_t paraSize;
-  ParaSparse sparse;
-  bool isStatic;
-  std::vector<int> labelInitValue;
-  std::vector<int> labelSeqStartPositions;
-  std::vector<int> labelSubSeqStartPositions;
-  std::vector<int> ids;
-  MatrixPtr selfDefinedData;
-
-  InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           MatrixPtr selfDefinedData,
-           std::vector<int> selfDefinedSeqStartPos = {},
-           std::vector<int> selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        selfDefinedData(selfDefinedData) {
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           const std::vector<int>& ids,
-           const std::vector<int>& selfDefinedSeqStartPos = {},
-           const std::vector<int>& selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        ids(ids) {
-    selfDefinedData = nullptr;
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           const std::vector<int>& labelInitValue,
-           const std::vector<int>& labelSeqStartPositions)
-      : labelInitValue(labelInitValue),
-        labelSeqStartPositions(labelSeqStartPositions) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           ParaSparse sparseIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = sparseIn;
-  }
-};
-
-struct TestConfig {
-  LayerConfig layerConfig;
-  std::vector<InputDef> inputDefs;
-  size_t biasSize;
-  real paramInitialMean;
-  real paramInitialStd;
-  bool testAccumulate;
-  bool testState;
-  bool staticBias;
-  bool testBatchState;
-  TestConfig()
-      : biasSize(0),
-        paramInitialMean(0.0),
-        paramInitialStd(1.0),
-        testAccumulate(true),
-        testState(false),
-        staticBias(false),
-        testBatchState(false) {}
-};
-
-real getCostSum(ParameterPtr& parameter,
-                CpuVector& cpuPara,
-                LayerPtr& testLayer,
-                MatrixPtr weights = nullptr);
-
-real getDiffAndPrint(real newCost1,
-                     real newCost2,
-                     real callbackCount,
-                     char fill,
-                     string testLayerName,
-                     string name,
-                     real step,
-                     real delta);
-
-/**
- * @brief verify that sequentially running forward() one timestamp at one time
- *        has same result as running forward() with one whole sequence
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testState(LayerPtr testLayer,
-               vector<DataLayerPtr>& dataLayers,
-               vector<Argument>& datas);
-
-/**
- * @brief verify that sequentially running forward() with short sequences one
- *        time has same result as running forward() with long sequences.
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testBatchState(LayerPtr testLayer,
-                    vector<DataLayerPtr>& dataLayers,
-                    vector<Argument>& datas);
-
-/**
- * @brief Generate a perturbation so that it is roughly aligned with the
- *        gradient direction. This is to make sure that change along this
- *        direction will make cost increase (or decrease) in a meaningful
- *        way so that the finite difference can be used to approximate the
- *        directional dirivative well.
- *
- * @param oldGrad[in]  input gradient
- *        newGrad[out] output gradient
- *        dim          dimension of oldGrad/newGrad
- *
- * @return sum_i(oldGrad[i] * newGrad[i])
- */
-double genPerturbation(const real* oldGrad, real* newGrad, size_t dim);
-
-void initWeight(MatrixPtr& weights);
-
-void initBatchState(LayerPtr dataLayer,
-                    LayerPtr testLayer,
-                    LayerStatePtr state,
-                    bool useGpu);
-
-/**
- * @brief initialize the dataLayer by its inputType
- *
- * @param testConf[in]        test config
- *        dataLayers[out]     dataLayers
- *        datas[out]          initialized data of dataLayers
- *        layerMap[out]       layerMap
- */
-void initDataLayer(TestConfig testConf,
-                   std::vector<DataLayerPtr>* dataLayers,
-                   vector<Argument>* datas,
-                   LayerMap* layerMap,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu);
-
-/**
- * @brief initialize the parameter of testLayer
- *
- * @param testConf[in/out]    test config
- *        layerMap[out]       layerMap
- *        parameters[out]     parameters of testLayer
- *        testLayer[out]      testLayer
- */
-void initTestLayer(TestConfig testConf,
-                   LayerMap* layerMap,
-                   std::vector<ParameterPtr>* parameters,
-                   LayerPtr* testLayer);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its parameters
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        parameters[in/out]   parameters of testLayer
- */
-void testPerturbParameter(TestConfig testConf,
-                          const MatrixPtr weights,
-                          const LayerStatePtr state,
-                          real cost,
-                          real callbackCount,
-                          real* maxDiff,
-                          LayerPtr testLayer,
-                          std::vector<ParameterPtr>* parameters);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its input layers
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        dataLayers[in/out]   dataLayers
- */
-void testPerturbInput(TestConfig testConf,
-                      const MatrixPtr weights,
-                      const LayerStatePtr state,
-                      real cost,
-                      real callbackCount,
-                      real* maxDiff,
-                      LayerPtr testLayer,
-                      std::vector<DataLayerPtr> dataLayers);
-
-void testLayerGradKernel(TestConfig testConf,
-                         string testLayerName,
-                         size_t batchSize,
-                         bool trans,
-                         bool useGpu,
-                         bool useWeight = false,
-                         float epsilon = 0.02);
-
-void testLayerGrad(TestConfig testConf,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu,
-                   bool useWeight = false,
-                   float epsilon = 0.02);
-
-void testProjectionGrad(ProjectionConfig conf,
-                        InputType inputType,
-                        size_t parameterSize,
-                        size_t batchSize,
-                        bool useGpu,
-                        bool testState = false,
-                        int biasSize = 0,
-                        bool sharedBias = false);
-
-void testOperatorGrad(TestConfig& config,
-                      OperatorConfig& operatorConf,
-                      size_t batchSize,
-                      bool useGpu,
-                      bool testState = false);
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.cpp b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
deleted file mode 100644
index b550ba9c72d..00000000000
--- a/paddle/legacy/gserver/tests/MKLDNNTester.cpp
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNTester.h"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-#include "paddle/legacy/trainer/Trainer.h"
-
-namespace paddle {
-
-// init data layer and test layer of both dnn and reference
-void MKLDNNTester::reset(const TestConfig& dnn,
-                         const TestConfig& ref,
-                         size_t batchSize) {
-  const bool trans = false;
-  const bool useGpu = false;
-
-  // clear
-  configs_.clear();
-  layerNames_.clear();
-  dataLayers_.clear();
-  datas_.clear();
-  layerMaps_.clear();
-  parameters_.clear();
-  testLayers_.clear();
-
-  // resize
-  configs_.resize(NUM);
-  layerNames_.resize(NUM);
-  dataLayers_.resize(NUM);
-  datas_.resize(NUM);
-  layerMaps_.resize(NUM);
-  parameters_.resize(NUM);
-  testLayers_.resize(NUM);
-
-  // reset configs and layer names
-  configs_[DNN] = dnn;
-  configs_[REF] = ref;
-  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
-  layerNames_[REF] = "reference";  // second is reference layer
-
-  // reset others
-  for (size_t i = 0; i < NUM; ++i) {
-    configs_[i].layerConfig.set_name(layerNames_[i]);
-    initDataLayer(configs_[i],
-                  &(dataLayers_[i]),
-                  &(datas_[i]),
-                  &(layerMaps_[i]),
-                  layerNames_[i],
-                  batchSize,
-                  trans,
-                  useGpu);
-    initTestLayer(
-        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
-  }
-  refLayer_ = testLayers_[REF];
-  dnnLayer_ = testLayers_[DNN];
-  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  setInputImgSize();
-
-  // for comparison with Paddle reference results,
-  // need manually add cpu device output for test
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->addOutputArgument(CPU_DEVICE);
-  }
-}
-
-void MKLDNNTester::setInputImgSize() {
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-      // TODO(TJ): fix me when concat and elewise ready
-      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
-      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
-    }
-  }
-}
-
-// init randome parameters of ref, and copy to mkldnn
-void MKLDNNTester::randomWgtDatas() {
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
-    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    parameters_[REF][i]->randomize();
-    if (isBN && i == 2) {
-      // this param is moving average in batch norm, which must larger than 0
-      real offset = fabs(refValue->getMin()) + 1.0;
-      refValue->add(offset);
-    }
-    dnnValue->copyFrom(*refValue);
-
-    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
-    printVector(dnnValue);
-  }
-}
-
-// random botdata of ref layer and copy same to mkldnn
-void MKLDNNTester::randomBotDatas() {
-  CHECK_EQ(dataLayers_.size(), NUM);
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
-    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
-        *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
-    printMatrix(dataLayers_[REF][i]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::randomTopDiffs() {
-  refLayer_->getOutputGrad()->randomizeUniform();
-  dnnLayer_->getOutput(CPU_DEVICE)
-      .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
-  printMatrix(refLayer_->getOutputGrad());
-}
-
-void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_TESTS) << "Check Forward";
-  printTopDatas();
-  double delta =
-      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
-  EXPECT_LE(fabs(delta), eps_);
-}
-
-void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Data";
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
-    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
-    printMatrix(dnnDiff);
-    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
-    printMatrix(refDiff);
-
-    double delta = compareMatrix(refDiff, dnnDiff);
-    EXPECT_LE(fabs(delta), eps_);
-    if (isBN) {
-      // the other two inputs in batch norm are for moving mean and var
-      // do not have grad to compare
-      break;
-    }
-  }
-}
-
-void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
-  saveWgt(parameters_[DNN], dnnWgts);
-
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->convertWeightsToPaddle();
-  }
-  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
-    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
-                     << parameters_[DNN][i]->getName();
-    printVector(dnn);
-    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
-                     << parameters_[REF][i]->getName();
-    printVector(ref);
-
-    double delta = compareVector(ref, dnn);
-    EXPECT_LE(fabs(delta), eps_);
-  }
-
-  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
-  restoreWgt(dnnWgts, parameters_[DNN]);
-}
-
-void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
-                           vector<VectorPtr>& to) {
-  const bool useGpu = false;
-  to.resize(from.size());
-  for (size_t i = 0; i < to.size(); ++i) {
-    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
-    to[i] = Vector::create(wgt->getSize(), useGpu);
-    to[i]->copyFrom(*wgt);
-  }
-}
-
-void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
-                              vector<ParameterPtr>& to) {
-  CHECK_EQ(from.size(), to.size());
-  for (size_t i = 0; i < from.size(); ++i) {
-    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
-    wgt->copyFrom(*from[i]);
-  }
-}
-
-// clear parameters grad
-void MKLDNNTester::clearWgtDiffs(size_t id) {
-  CHECK_LE(id, parameters_.size());
-  for (size_t n = 0; n < parameters_.size(); ++n) {
-    if (id == n || id == parameters_.size()) {
-      for (size_t i = 0; i < parameters_[n].size(); ++i) {
-        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
-        if (grad) {
-          grad->zeroMem();
-        }
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearBotDiffs(size_t id) {
-  CHECK_LE(id, dataLayers_.size());
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    if (id == n || id == dataLayers_.size()) {
-      // clear inputs layers of this specific layer
-      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-        dataLayers_[n][i]->getOutputGrad()->zeroMem();
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearTopDatas(size_t id) {
-  CHECK_LE(id, testLayers_.size());
-  for (size_t i = 0; i < testLayers_.size(); ++i) {
-    if (id == i || id == testLayers_.size()) {
-      testLayers_[i]->getOutputValue()->zeroMem();
-    }
-  }
-}
-
-void MKLDNNTester::printTopDatas() {
-  if (!log_) {
-    return;
-  }
-
-  for (int n = 0; n < NUM; ++n) {
-    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
-                     << " Forward Result: OutputValue";
-    printMatrix(testLayers_[n]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::printMatrix(const MatrixPtr& m) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  m->print(ostr);
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-void MKLDNNTester::printVector(const VectorPtr& v) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  v->print(ostr, v->getSize());
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-double MKLDNNTester::getDelta(const real* refer,
-                              const real* value,
-                              size_t len,
-                              const float failRate,
-                              const float thres) {
-  double delta = 0, sum = 0;
-  int failCnt = 0;
-  const double eps = 1e-5;
-  double maxRatio = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double ref = fabs(refer[i]);
-    double val = fabs(value[i]);
-    double diff = fabs(refer[i] - value[i]);
-    delta += diff;
-    sum += ref;
-    if (ref < eps && val < eps) {  // both values are very small
-      continue;
-    }
-    double ratio = diff / ref;
-    if (ratio > thres) {
-      maxRatio = std::max(maxRatio, ratio);
-      failCnt++;
-    }
-  }
-  EXPECT_FALSE(std::isinf(sum));
-  EXPECT_FALSE(std::isnan(sum));
-  EXPECT_FALSE(std::isnan(delta));
-  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
-                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
-  double res = sum > eps ? delta / sum : eps;
-  return (failCnt / (float)len) > failRate ? maxRatio : res;
-}
-
-double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
-  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
-  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
-}
-
-double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
-  CHECK_EQ(v1->getSize(), v2->getSize());
-  return getDelta(v1->getData(), v2->getData(), v1->getSize());
-}
-
-void MKLDNNTester::runOnce() {
-  // test forward
-  randomBotDatas();
-  dnnLayer_->forward(passType_);
-  refLayer_->forward(passType_);
-  checkForward();
-
-  if (passType_ == PASS_TEST) {
-    return;
-  }
-
-  // test backward
-  // simple updater
-  UpdateCallback updateCallback = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-  randomTopDiffs();
-  dnnLayer_->backward(updateCallback);
-  refLayer_->backward(updateCallback);
-  checkBackwardData();
-  checkBackwardWgts();
-
-  // clear buffers
-  // ref code will addto the diff, dnn code will writeto it
-  // and clearTopDatas(REF) should be coverd by ref layers
-  clearBotDiffs(REF);
-  clearWgtDiffs(REF);
-  // it is necessary to clear bottom diffs when only activation is dnn type
-  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
-    clearBotDiffs(DNN);
-  }
-}
-
-void MKLDNNTester::run(const TestConfig& dnn,
-                       const TestConfig& ref,
-                       size_t batchSize,
-                       size_t inputImgH,
-                       size_t inputImgW,
-                       PassType passType,
-                       bool printDetails,
-                       size_t iter,
-                       float epsilon) {
-  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
-        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
-      << "should be MKLDNN layer or MKLDNN activation";
-  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.active_type() << " vs "
-                       << ref.layerConfig.active_type();
-  } else {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.type() << " vs "
-                       << ref.layerConfig.type();
-  }
-
-  ih_ = inputImgH;
-  iw_ = inputImgW;
-  passType_ = passType;
-  log_ = printDetails;
-  iter_ = iter;
-  eps_ = epsilon;
-
-  // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
-  reset(dnn, ref, batchSize);
-  randomWgtDatas();
-  clearWgtDiffs();
-  clearBotDiffs();
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-
-  if (parameters_[DNN].empty()) {
-    // has no paramters
-    return;
-  }
-
-  // After run some iterations, the mkldnn weight has been stored in dnnLayer
-  // and we can also get the mkldnn weight parameter header format.
-  // Weight parameter should always be index 0 (and bias index 1).
-  // TODO(TJ): should also consider mean and var format when batchnorm ready
-  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
-  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
-  if (dnnWgtFmt == refWgtFmt) {
-    // weight format are equal, so no need check more
-    return;
-  }
-
-  // then save the weights and restart again
-  vector<VectorPtr> dnnWgts, refWgts;
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  saveWgt(parameters_[DNN], dnnWgts);
-  saveWgt(parameters_[REF], refWgts);
-
-  // restart again with dnn weight format
-  reset(dnn, ref, batchSize);
-  // TODO(TJ): should also considerate mean and var format when batchnorm ready
-  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
-
-  // restore wgt
-  restoreWgt(dnnWgts, parameters_[DNN]);
-  restoreWgt(refWgts, parameters_[REF]);
-  clearWgtDiffs();
-  clearBotDiffs();
-
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-}
-
-void MKLDNNTester::initArgument(DataIn& data,
-                                const std::string& configPath,
-                                const size_t iter) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-  data.inArgs.resize(iter);
-  data.outGrads.resize(iter);
-  data.paraValues.clear();
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      Argument arg;
-      arg.value = Matrix::create(batchSize, layerSize, false, false);
-      arg.grad = Matrix::create(batchSize, layerSize, false, false);
-      arg.value->randomizeUniform();
-      arg.value->add(-0.5);
-      arg.value->sigmoid(*arg.value);
-      arg.grad->zeroMem();
-      arg.ids = VectorT<int>::create(batchSize, false);
-      arg.ids->rand(layerSize);
-      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-      data.inArgs[i].push_back(arg);
-    }
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
-      grad->randomizeUniform();
-      data.outGrads[i].push_back(grad);
-    }
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), false);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void MKLDNNTester::getOutResult(const std::string& configPath,
-                                DataIn& in,
-                                DataOut& out,
-                                bool use_mkldnn,
-                                size_t iter) {
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = use_mkldnn;
-  *ThreadLocalRand::getSeed() = 1;
-  srand(1);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-  auto gradientMachine = trainer.getGradientMachine();
-  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-  }
-  UpdateCallback simpleUpdate = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-
-  vector<Argument> outArgs;
-  gradientMachine->start();
-  out.outValues.clear();
-  out.paraValues.clear();
-  for (size_t i = 0; i < iter; ++i) {
-    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
-    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
-    // save forward result
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      const MatrixPtr& src = outArgs[k].value;
-      MatrixPtr dst =
-          Matrix::create(src->getHeight(), src->getWidth(), false, false);
-      if (typeid(*src) == typeid(MKLDNNMatrix)) {
-        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
-        dnnSrc->copyTo(*dst);
-      } else {
-        dst->copyFrom(*src);
-      }
-      out.outValues.push_back(dst);
-    }
-
-    // random backward input
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
-    }
-    gradientMachine->backward(simpleUpdate);
-  }
-  gradientMachine->finish();
-
-  // save param value
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr val = Vector::create(
-        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
-    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    out.paraValues.push_back(val);
-  }
-}
-
-void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
-  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
-  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
-  for (size_t i = 0; i < ref.outValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
-    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
-  }
-  for (size_t i = 0; i < ref.paraValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
-    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
-  }
-}
-
-void MKLDNNTester::runNetTest(const std::string& configPath,
-                              size_t iter,
-                              float eps) {
-  DataIn in;
-  initArgument(in, configPath, iter);
-  DataOut outCpu, outDnn;
-  VLOG(MKLDNN_TESTS) << "runing cpu network";
-  getOutResult(configPath, in, outCpu, false, iter);
-  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
-  getOutResult(configPath, in, outDnn, true, iter);
-
-  compareResult(outCpu, outDnn, eps);
-}
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.h b/paddle/legacy/gserver/tests/MKLDNNTester.h
deleted file mode 100644
index 086846ce537..00000000000
--- a/paddle/legacy/gserver/tests/MKLDNNTester.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "LayerGradUtil.h"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-
-namespace paddle {
-
-/**
- * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
- * refer to paddle original function
- */
-class MKLDNNTester {
-  enum {
-    DNN = 0,  // MKLDNN layer
-    REF = 1,  // Reference layer
-    NUM = 2,  // Number of total
-  };
-
-  struct DataIn {
-    std::vector<std::vector<Argument>> inArgs;
-    std::vector<std::vector<MatrixPtr>> outGrads;
-    std::vector<VectorPtr> paraValues;
-  };
-
-  struct DataOut {
-    std::vector<MatrixPtr> outValues;
-    std::vector<VectorPtr> paraValues;
-  };
-
- protected:
-  std::vector<TestConfig> configs_;
-  vector<string> layerNames_;
-  vector<vector<DataLayerPtr>> dataLayers_;
-  vector<vector<Argument>> datas_;
-  vector<LayerMap> layerMaps_;
-  vector<vector<ParameterPtr>> parameters_;
-  vector<LayerPtr> testLayers_;
-  LayerPtr refLayer_, dnnLayer_;
-
-  /// run some iterations, all the result should pass
-  size_t iter_;
-  /// whether to print out the details
-  bool log_;
-  /// epsilon
-  float eps_;
-  /// input image size, default 1
-  size_t ih_, iw_;
-  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
-  PassType passType_;
-
- public:
-  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
-    iter_ = iter;
-    eps_ = epsilon;
-    log_ = false;
-    passType_ = PASS_TRAIN;
-  }
-
-  ~MKLDNNTester() {}
-
- public:
-  void run(const TestConfig& dnn,
-           const TestConfig& ref,
-           size_t batchSize,
-           size_t inputImgH = 1,
-           size_t inputImgW = 1,
-           PassType passType = PASS_TRAIN,
-           bool printDetails = false,
-           size_t iter = 3,
-           float epsilon = 1e-4);
-  static void runNetTest(const std::string& configPath,
-                         size_t iter = 2,
-                         float eps = 1e-4);
-  static void initArgument(DataIn& data,
-                           const std::string& configPath,
-                           size_t iter = 2);
-  static void getOutResult(const std::string& configPath,
-                           DataIn& in,
-                           DataOut& out,
-                           bool use_mkldnn,
-                           size_t iter = 2);
-
- private:
-  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
-  void setInputImgSize();
-  void runOnce();
-
-  void randomWgtDatas();
-  void randomBotDatas();
-  void randomTopDiffs();
-
-  void checkForward();
-  void checkBackwardData();
-  void checkBackwardWgts();
-
-  // clear specific layer, clear all when id equals NUM
-  void clearWgtDiffs(size_t id = NUM);
-  void clearBotDiffs(size_t id = NUM);
-  void clearTopDatas(size_t id = NUM);
-
-  void printTopDatas();
-  void printMatrix(const MatrixPtr& m);
-  void printVector(const VectorPtr& v);
-
-  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
-  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
-
-  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
-  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
-
-  /**
-   * Get delta percent
-   * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points
-   * return the max(diff/ref)
-   * else return sum(abs(diff)) / sum(abs(ref))
-   * The return value should be smaller than eps when passing.
-   */
-  static double getDelta(const real* refer,
-                         const real* value,
-                         size_t len,
-                         const float failRate = 1e-3,
-                         const float thres = 0.1);
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/Sequence/dummy.list b/paddle/legacy/gserver/tests/Sequence/dummy.list
deleted file mode 100644
index 0e52665e112..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/dummy.list
+++ /dev/null
@@ -1 +0,0 @@
-dummy_file_no_use
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict b/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
deleted file mode 100644
index 41f68e7f5c0..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
+++ /dev/null
@@ -1,158 +0,0 @@
-，
-的
-。
-酒店
-房间
-了
-很
-也
-不错
-是
-！
-有
-服务
-就是
-都
-住
-一
-在
-好
-月湖
-不
-可以
-.
-且
-就
-离
-方便
-早餐
-还是
-近
-位置
-干净
-床上用品
-、
-价格
-挺
-强烈推荐
-感觉
-卫生
-本来
-挺好
-性价比
-房
-前台
-下次
-交通
-不过
-很方便
-给
-没
-这个
-不少
-还有
-十一
-来
-还会
-停电
-推荐
-流
-服务员
-新
-舒适
-选择
-热情
-简直
-吃饭
-安静
-吃
-很干净
-地理位置
-便利
-得
-这
-子
-杯子
-很多
-周围
-適
-第
-天一广场
-整体
-好吃
-*
-尚可
-品质
-2
-时候
-家
-出差
-又
-较
-便宜
-整洁
-啊
-汉庭
-交通便利
-旁边
-对
-去过
-次
-利落
-合
-换
-窗户
-温馨
-最
-两
-应该
-只有
-适中
-出去玩
-很安静
-商务
-对面
-道歉
-乾
-地铁站
-居然
-不远
-总体来说
-泳池
-地段
-全家
-相对
-晚
-天一阁
-电脑
-來
-呀
-一人
-口头
-上网
-刷牙
-相当
-天
-合理
-准备
-通知
-第一天
-水温
-出来
-五星级
-快
-无
-楼层
-各方面
-华润万家
-宁波
-选
-放心
-浄
-主要原因
-安排
-客户
-一次性杯子
-起
-床垫
-一早
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
deleted file mode 100644
index 2cdf7f7e14e..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
+++ /dev/null
@@ -1,10 +0,0 @@
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
deleted file mode 100644
index 3aa890d8aa1..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
+++ /dev/null
@@ -1,14 +0,0 @@
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
-
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list b/paddle/legacy/gserver/tests/Sequence/train.list
deleted file mode 100644
index 1109a244925..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/train.list
+++ /dev/null
@@ -1 +0,0 @@
-legacy/gserver/tests/Sequence/tour_train_wdseg
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list.nest b/paddle/legacy/gserver/tests/Sequence/train.list.nest
deleted file mode 100644
index a67df35024f..00000000000
--- a/paddle/legacy/gserver/tests/Sequence/train.list.nest
+++ /dev/null
@@ -1 +0,0 @@
-legacy/gserver/tests/Sequence/tour_train_wdseg.nest
diff --git a/paddle/legacy/gserver/tests/__init__.py b/paddle/legacy/gserver/tests/__init__.py
deleted file mode 100644
index f662d682632..00000000000
--- a/paddle/legacy/gserver/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/gserver/tests/concat_dotmul_a.conf b/paddle/legacy/gserver/tests/concat_dotmul_a.conf
deleted file mode 100644
index db02ca7e80d..00000000000
--- a/paddle/legacy/gserver/tests/concat_dotmul_a.conf
+++ /dev/null
@@ -1,31 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000)
-
-data = data_layer(name ="input", size=1000)
-
-with mixed_layer(size=1000) as layer1:
-    layer1 += dotmul_projection(input=data)
-
-with mixed_layer(size=1000) as layer2:
-    layer2 += dotmul_projection(input=data)
-
-concat = concat_layer(input=[layer1, layer2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_dotmul_b.conf b/paddle/legacy/gserver/tests/concat_dotmul_b.conf
deleted file mode 100644
index 5e64970e444..00000000000
--- a/paddle/legacy/gserver/tests/concat_dotmul_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000)
-
-data = data_layer(name ="input", size=1000)
-
-proj1 = dotmul_projection(input=data)
-
-proj2 = dotmul_projection(input=data)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
deleted file mode 100644
index 940d1efc58f..00000000000
--- a/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
+++ /dev/null
@@ -1,35 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=100)
-
-# fc1 is equal to fc2
-# note that in mixed_layer, default bias_attr=False,
-# and default act=LinearActivation().
-fc1 = fc_layer(input=data, size=1000, 
-               bias_attr=False, 
-               act=LinearActivation())
-
-with mixed_layer(size=1000) as fc2:
-    fc2 += full_matrix_projection(input=data)
-
-concat = concat_layer(input=[fc1, fc2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
deleted file mode 100644
index 931e5b38efa..00000000000
--- a/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=100)
-
-proj1 = full_matrix_projection(input=data, size=1000)
-
-proj2 = full_matrix_projection(input=data, size=1000)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_slice_a.conf b/paddle/legacy/gserver/tests/concat_slice_a.conf
deleted file mode 100644
index dccf911089e..00000000000
--- a/paddle/legacy/gserver/tests/concat_slice_a.conf
+++ /dev/null
@@ -1,41 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=8*16*16)
-
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
-
-proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
-
diff --git a/paddle/legacy/gserver/tests/concat_slice_b.conf b/paddle/legacy/gserver/tests/concat_slice_b.conf
deleted file mode 100644
index 29686ef2810..00000000000
--- a/paddle/legacy/gserver/tests/concat_slice_b.conf
+++ /dev/null
@@ -1,41 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=8*16*16)
-
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-proj1 = slice_projection(input=conv1, slices=[(0, 12)])
-
-proj2 = slice_projection(input=conv2, slices=[(1, 15)])
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
-
diff --git a/paddle/legacy/gserver/tests/concat_table_a.conf b/paddle/legacy/gserver/tests/concat_table_a.conf
deleted file mode 100644
index 047cb44d156..00000000000
--- a/paddle/legacy/gserver/tests/concat_table_a.conf
+++ /dev/null
@@ -1,32 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=300)
-
-data = data_layer(name ="input", size=10000)
-
-# emb1 is equal to emb2, note that bias_attr=false 
-# and act=LinearActivation() in default.
-emb1 = embedding_layer(input=data, size=128)
-
-with mixed_layer(size=128) as emb2:
-    emb2 += table_projection(input=data)
-
-concat = concat_layer(input=[emb1, emb2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_table_b.conf b/paddle/legacy/gserver/tests/concat_table_b.conf
deleted file mode 100644
index c666ab99427..00000000000
--- a/paddle/legacy/gserver/tests/concat_table_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=300)
-
-data = data_layer(name ="input", size=10000)
-
-proj1 = table_projection(input=data, size=128)
-
-proj2 = table_projection(input=data, size=128)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/img_conv_a.conf b/paddle/legacy/gserver/tests/img_conv_a.conf
deleted file mode 100644
index 3ad15c64fe5..00000000000
--- a/paddle/legacy/gserver/tests/img_conv_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8,
-                        num_filters=16, stride=1,
-                        bias_attr=False,
-                        act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-concat = concat_layer(input=[conv1, conv2])
-
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=16, stride=1,
-                      bias_attr=True,
-                      act=LinearActivation(),
-                      groups=2)
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_b.conf b/paddle/legacy/gserver/tests/img_conv_b.conf
deleted file mode 100644
index e68008155e9..00000000000
--- a/paddle/legacy/gserver/tests/img_conv_b.conf
+++ /dev/null
@@ -1,32 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8, num_filters=16, stride=1)
-proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8, num_filters=16, stride=1)
-concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
-
-proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8, num_filters=16, stride=1, groups=2)
-
-with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
-    conv += proj
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_c.conf b/paddle/legacy/gserver/tests/img_conv_c.conf
deleted file mode 100644
index 4598ffbdb2f..00000000000
--- a/paddle/legacy/gserver/tests/img_conv_c.conf
+++ /dev/null
@@ -1,43 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8,
-                        num_filters=16, stride=1,
-                        bias_attr=False,
-                        act=ReluActivation(),
-                        layer_type="exconv")
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation(),
-                       layer_type="exconv")
-
-concat = concat_layer(input=[conv1, conv2])
-
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=16, stride=1,
-                      bias_attr=True,
-                      act=LinearActivation(),
-                      groups=2,
-                      layer_type="exconv")
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_cudnn.py b/paddle/legacy/gserver/tests/img_conv_cudnn.py
deleted file mode 100644
index fd889ee1ce8..00000000000
--- a/paddle/legacy/gserver/tests/img_conv_cudnn.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name="input", size=8 * 16 * 16)
-conv = img_conv_layer(
-    input=data,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    stride=1,
-    bias_attr=True,
-    act=LinearActivation(),
-    groups=2,
-    layer_type="cudnn_conv")
-
-outputs(conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_exconv.py b/paddle/legacy/gserver/tests/img_conv_exconv.py
deleted file mode 100644
index 5aca6da5acf..00000000000
--- a/paddle/legacy/gserver/tests/img_conv_exconv.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name="input", size=8 * 16 * 16)
-conv = img_conv_layer(
-    input=data,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    stride=1,
-    bias_attr=True,
-    act=LinearActivation(),
-    groups=2,
-    layer_type="exconv")
-
-outputs(conv)
diff --git a/paddle/legacy/gserver/tests/img_pool_a.conf b/paddle/legacy/gserver/tests/img_pool_a.conf
deleted file mode 100644
index afd271055d9..00000000000
--- a/paddle/legacy/gserver/tests/img_pool_a.conf
+++ /dev/null
@@ -1,44 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=8,stride=1)
-maxpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=MaxPooling(),
-)
-avgpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=AvgPooling(),
-)
-
-outputs([maxpool, avgpool])
diff --git a/paddle/legacy/gserver/tests/img_pool_b.conf b/paddle/legacy/gserver/tests/img_pool_b.conf
deleted file mode 100644
index e8deb9edbe7..00000000000
--- a/paddle/legacy/gserver/tests/img_pool_b.conf
+++ /dev/null
@@ -1,44 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8, num_filters=8, stride=1)
-maxpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=CudnnMaxPooling(),
-)
-
-avgpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=CudnnAvgPooling(),
-)
-
-outputs([maxpool, avgpool])
diff --git a/paddle/legacy/gserver/tests/mkldnn_branch_net.conf b/paddle/legacy/gserver/tests/mkldnn_branch_net.conf
deleted file mode 100644
index 8d5146abb0e..00000000000
--- a/paddle/legacy/gserver/tests/mkldnn_branch_net.conf
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-def two_conv(input, group_name):
-  out1 = img_conv_layer(input=input,
-              name=group_name+'_conv1_',
-              filter_size=1,
-              num_filters=channels,
-              padding=0,
-              shared_biases=True,
-              act=ReluActivation())
-
-  out2 = img_conv_layer(input=input,
-              name=group_name+'_conv2_',
-              filter_size=3,
-              num_filters=channels,
-              padding=1,
-              shared_biases=True,
-              act=ReluActivation())
-  return out1, out2
-
-def two_conv_bn(input, group_name):
-  out1, out2 = two_conv(input, group_name)
-  out1 = batch_norm_layer(input=out1,
-              name=group_name+'_bn1_',
-              use_global_stats=False,
-              act=ReluActivation())
-
-  out2 = batch_norm_layer(input=out2,
-              name=group_name+'_bn2_',
-              use_global_stats=False,
-              act=ReluActivation())
-  return out1, out2
-
-def two_conv_pool(input, group_name):
-  out1, out2 = two_conv(input, group_name)
-  out1 = img_pool_layer(input=out1,
-              name=group_name+'_pool1_',
-              pool_size=3,
-              stride=2,
-              padding=0,
-              pool_type=MaxPooling())
-
-  out2 = img_pool_layer(input=out2,
-              name=group_name+'_pool2_',
-              pool_size=5,
-              stride=2,
-              padding=1,
-              pool_type=MaxPooling())
-  return out1, out2
-
-def two_fc(input, group_name):
-  out1 = fc_layer(input=input,
-            name=group_name+'_fc1_',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-
-  out2 = fc_layer(input=input,
-            name=group_name+'_fc2_',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-  return out1, out2
-
-data = data_layer(name ="input", size=channels*16*16)
-
-tmp = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-a1, a2 = two_conv(tmp, 'conv_branch')
-tmp = addto_layer(input=[a1, a2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=AvgPooling())
-
-b1, b2 = two_conv_pool(tmp, 'pool_branch')
-tmp = concat_layer(input=[b1, b2])
-
-tmp = img_pool_layer(input=tmp,
-            num_channels=channels*2,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            stride=2,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-c1, c2 = two_conv_bn(tmp, 'bn_branch')
-tmp = addto_layer(input=[c1, c2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = fc_layer(input=tmp, size=channels,
-            bias_attr=True,
-            act=ReluActivation())
-
-d1, d2 = two_fc(tmp, 'fc_branch')
-tmp = addto_layer(input=[d1, d2])
-
-out = fc_layer(input=tmp, size=10,
-            bias_attr=True,
-            act=SoftmaxActivation())
-
-outputs(out)
diff --git a/paddle/legacy/gserver/tests/mkldnn_simple_net.conf b/paddle/legacy/gserver/tests/mkldnn_simple_net.conf
deleted file mode 100644
index 0e9d6b31fa8..00000000000
--- a/paddle/legacy/gserver/tests/mkldnn_simple_net.conf
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-data = data_layer(name ="input", size=channels*16*16)
-
-tmp = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=1,
-            padding=0,
-            pool_type=AvgPooling())
-
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
-
-tmp = fc_layer(input=tmp,
-            size=channels,
-            bias_attr=False,
-            act=ReluActivation())
-
-out = fc_layer(input=tmp,
-            size=10,
-            bias_attr=True,
-            act=SoftmaxActivation())
-
-outputs(out)
diff --git a/paddle/legacy/gserver/tests/pyDataProvider.py b/paddle/legacy/gserver/tests/pyDataProvider.py
deleted file mode 100644
index 85ea90d6eec..00000000000
--- a/paddle/legacy/gserver/tests/pyDataProvider.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import numpy
-import struct
-import traceback
-
-
-def header_creator():
-    ret = ""
-    ret += struct.pack('i', 3)  # slot num
-    ret += struct.pack('i', 1)  # sequence flag
-    ret += struct.pack('i', 0)  # slot0 dense type
-    ret += struct.pack('i', 3)  # slot0 dim
-    ret += struct.pack('i', 1)  # slot1 sparse non value type
-    ret += struct.pack('i', 7)  # slot1 dim
-    ret += struct.pack('i', 3)  # slot2 index type
-    ret += struct.pack('i', 2)  # slot2 dim
-    return ret
-
-
-def dense_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot0 sample num
-    for i in range(sample_num):  # slot0 value
-        ret += struct.pack('f', 1.0)
-        ret += struct.pack('f', 2.0)
-        ret += struct.pack('f', 3.0)
-    return ret
-
-
-def sparse_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot1 sample num
-    for i in range(sample_num):  # slot1 index
-        ret += struct.pack('i', i * 2)
-    ret += struct.pack('i', sample_num * 2)  #slot1 length
-    for i in range(sample_num):  # slot1 value
-        ret += struct.pack('i', 1)
-        ret += struct.pack('i', 2)
-    return ret
-
-
-def index_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot2 sample num
-    for i in range(sample_num):  # slot2 value
-        ret += struct.pack('i', 0)
-    return ret
-
-
-def sequenceStartPositions_creator():
-    ret = ""
-    ret += struct.pack('i', 2)  # slot0 sequence num
-    ret += struct.pack('i', 0)  # slot0 sequence value1
-    ret += struct.pack('i', 1)  # slot0 sequence value2
-    ret += struct.pack('i', 1)  # slot1 sequence num
-    ret += struct.pack('i', 0)  # slot1 sequence value1
-    ret += struct.pack('i', 2)  # slot2 sequence num
-    ret += struct.pack('i', 0)  # slot2 sequence value1
-    ret += struct.pack('i', 1)  # slot2 sequence value2
-    return ret
-
-
-def subSequenceStartPositions_creator():
-    ret = ""
-    ret += struct.pack('i', 3)  # slot0 subsequence num
-    ret += struct.pack('i', 0)  # slot0 subsequence value1
-    ret += struct.pack('i', 1)  # slot0 subsequence value2
-    ret += struct.pack('i', 2)  # slot0 subsequence value3
-    ret += struct.pack('i', 2)  # slot1 subsequence num
-    ret += struct.pack('i', 0)  # slot1 subsequence value1
-    ret += struct.pack('i', 1)  # slot1 subsequence value2
-    ret += struct.pack('i', 3)  # slot2 subsequence num
-    ret += struct.pack('i', 0)  # slot2 subsequence value1
-    ret += struct.pack('i', 1)  # slot2 subsequence value2
-    ret += struct.pack('i', 2)  # slot2 subsequence value3
-    return ret
-
-
-class SimpleDataProvider:
-    def __init__(self, *file_list):
-        self.file_list = file_list
-
-    def shuffle(self):
-        pass
-
-    def reset(self):
-        pass
-
-    def getHeader(self):
-        return header_creator()
-
-    def getNextBatch(self, batch_size):
-        ret = ""
-        ret += struct.pack('i', 2)  # batch size
-        ret += dense_value_creator(2)  # slot0
-        ret += sparse_value_creator(2)  # slot1
-        ret += index_value_creator(2)  # slot2
-        ret += sequenceStartPositions_creator()
-        return ret
-
-
-class SimpleNestDataProvider:
-    def __init__(self, *file_list):
-        self.file_list = file_list
-
-    def shuffle(self):
-        pass
-
-    def reset(self):
-        pass
-
-    def getHeader(self):
-        return header_creator()
-
-    def getNextBatch(self, batch_size):
-        ret = ""
-        ret += struct.pack('i', 2)  # batch size
-        ret += dense_value_creator(4)  # slot0
-        ret += sparse_value_creator(4)  # slot1
-        ret += index_value_creator(4)  # slot2
-        ret += sequenceStartPositions_creator()
-        ret += subSequenceStartPositions_creator()
-        return ret
-
-
-if __name__ == "__main__":
-    # test code
-    data_provider = SimpleDataProvider('./test_batch')
-    print len(data_provider.getHeader())
-    print len(data_provider.getNextBatch(2))
-
-    data_provider = SimpleNestDataProvider('./test_batch')
-    print len(data_provider.getHeader())
-    print len(data_provider.getNextBatch(2))
diff --git a/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList b/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf b/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
deleted file mode 100644
index 7d910df20d4..00000000000
--- a/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-################################### Data Configuration ###################
-TrainData(PyData(type="py", 
-            files = "./gserver/tests/pyDataProvider/pyDataProviderList", 
-            load_data_module="pyDataProvider", 
-            load_data_object="SimpleDataProvider"))
-
-################################### Algorithm Configuration #############
-Settings(
-    learning_rate_decay_a = 1e-05,
-    learning_rate_decay_b = 1e-06,
-    learning_rate = 0.001,
-    batch_size = 1,
-    algorithm = 'sgd',
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-)
-
-################################### Network Configuration ###############
-Layer(type = "data", name = "input1", size = 3)
-Layer(type = "data", name = "input2", size = 7)
-
-Layer(inputs = [Input("input1", 
-                      decay_rate = 0.12, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer1_1.w"), 
-                Input("input2", 
-                      decay_rate = 0.12, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer1_2.w"),
-               ], 
-      name = "layer1", 
-      bias = Bias(parameter_name = "_layer1.bias"), 
-      active_type = "sigmoid", 
-      type = "fc", 
-      size = 100)
-Layer(inputs = [Input("layer1", 
-                      decay_rate = 0.06, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer2.w")], 
-      name = "layer2", 
-      bias = Bias(parameter_name = "_layer2.bias"), 
-      active_type = "sigmoid", 
-      type = "fc", 
-      size = 100)
-Layer(inputs = [Input("layer2", 
-                      decay_rate = 0.02, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer_output.w")], 
-      name = "output", 
-      bias = Bias(parameter_name = "_layer_output.bias"), 
-      active_type = "softmax", 
-      type = "fc", 
-      size = 10)
-
-Layer(type = "data", name = "label", size = 1)
-Layer(inputs = [Input("output"), Input("label")], 
-      type = "multi-class-cross-entropy", 
-      name = "cost")
-Inputs("input1", "input2", "label")
-Outputs("cost")
diff --git a/paddle/legacy/gserver/tests/rnn_data_provider.py b/paddle/legacy/gserver/tests/rnn_data_provider.py
deleted file mode 100644
index 18b2191f44e..00000000000
--- a/paddle/legacy/gserver/tests/rnn_data_provider.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-
-# Note that each config should has an independent provider
-# in current design of PyDataProvider2.
-#######################################################
-data = [
-    [[[1, 3, 2], [4, 5, 2]], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], 1],
-]
-
-
-# Used for sequence_nest_rnn.conf
-@provider(
-    input_types=[integer_value_sub_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_subseq(settings, file_name):
-    for d in data:
-        yield d
-
-
-# Used for sequence_rnn.conf
-@provider(
-    input_types=[integer_value_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_seq(settings, file_name):
-    for d in data:
-        seq = []
-        for subseq in d[0]:
-            seq += subseq
-        yield seq, d[1]
-
-
-# Used for sequence_nest_rnn_multi_input.conf
-@provider(
-    input_types=[integer_value_sub_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_subseq2(settings, file_name):
-    for d in data:
-        yield d
-
-
-# Used for sequence_rnn_multi_input.conf
-@provider(
-    input_types=[integer_value_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_seq2(settings, file_name):
-    for d in data:
-        seq = []
-        for subseq in d[0]:
-            seq += subseq
-        yield seq, d[1]
-
-
-###########################################################
-data2 = [
-    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], [[1, 5], [4], [2, 3, 6, 1]], 1],
-]
-
-
-# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sub_sequence(10), integer_value_sub_sequence(10),
-        integer_value(2)
-    ],
-    should_shuffle=False)
-def process_unequalength_subseq(settings, file_name):
-    for d in data2:
-        yield d
-
-
-# Used for sequence_rnn_multi_unequalength_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sequence(10), integer_value_sequence(10), integer_value(2)
-    ],
-    should_shuffle=False)
-def process_unequalength_seq(settings, file_name):
-    for d in data2:
-        words1 = reduce(lambda x, y: x + y, d[0])
-        words2 = reduce(lambda x, y: x + y, d[1])
-        yield words1, words2, d[2]
-
-
-###########################################################
-data3 = [
-    [[[1, 2], [4, 5, 2]], [1, 2], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
-]
-
-
-# Used for sequence_nest_mixed_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sub_sequence(10), integer_value_sequence(10),
-        integer_value(2)
-    ],
-    should_shuffle=False)
-def process_mixed(settings, file_name):
-    for d in data3:
-        yield d
diff --git a/paddle/legacy/gserver/tests/sequenceGen.py b/paddle/legacy/gserver/tests/sequenceGen.py
deleted file mode 100644
index d5ec8ac23f1..00000000000
--- a/paddle/legacy/gserver/tests/sequenceGen.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import os
-import sys
-
-from paddle.trainer.PyDataProvider2 import *
-
-
-def hook(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [
-        integer_value_sequence(len(settings.word_dict)), integer_value(3)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook, should_shuffle=False)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            label, comment = line.strip().split('\t')
-            label = int(''.join(label.split()))
-            words = comment.split()
-            words = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            yield words, label
-
-
-## for hierarchical sequence network
-def hook2(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [
-        integer_value_sub_sequence(len(settings.word_dict)),
-        integer_value_sequence(3)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook2, should_shuffle=False)
-def process2(settings, file_name):
-    with open(file_name) as fdata:
-        labels = []
-        sentences = []
-        for line in fdata:
-            if (len(line)) > 1:
-                label, comment = line.strip().split('\t')
-                label = int(''.join(label.split()))
-                words = comment.split()
-                words = [
-                    settings.word_dict[w] for w in words
-                    if w in settings.word_dict
-                ]
-                labels.append(label)
-                sentences.append(words)
-            else:
-                yield sentences, labels
-                labels = []
-                sentences = []
diff --git a/paddle/legacy/gserver/tests/sequence_layer_group.conf b/paddle/legacy/gserver/tests/sequence_layer_group.conf
deleted file mode 100644
index ad1b61d5821..00000000000
--- a/paddle/legacy/gserver/tests/sequence_layer_group.conf
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# (lstm_input + lstm) is equal to lstmemory 
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory_group(
-    input=lstm_input,
-    size=hidden_dim,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_lstm.conf b/paddle/legacy/gserver/tests/sequence_lstm.conf
deleted file mode 100644
index 6ab70e70713..00000000000
--- a/paddle/legacy/gserver/tests/sequence_lstm.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data,
-    size=word_dim,
-    param_attr=ParamAttr(sparse_update=sparse_update))
-
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory(
-    input=lstm_input,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
deleted file mode 100644
index 75c36b11897..00000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list.nest',
-    test_list=None,
-    module='sequenceGen',
-    obj='process2',
-    args={"dict_file": dict_file})
-
-settings(batch_size=2)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb_group = embedding_layer(input=data, size=word_dim)
-
-
-# (lstm_input + lstm) is equal to lstmemory 
-def lstm_group(lstm_group_input):
-    with mixed_layer(size=hidden_dim * 4) as group_input:
-        group_input += full_matrix_projection(input=lstm_group_input)
-
-    lstm_output = lstmemory_group(
-        input=group_input,
-        name="lstm_group",
-        size=hidden_dim,
-        act=TanhActivation(),
-        gate_act=SigmoidActivation(),
-        state_act=TanhActivation())
-    return lstm_output
-
-
-lstm_nest_group = recurrent_group(
-    input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
-# hasSubseq ->(seqlastins) seq
-lstm_last = last_seq(
-    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)
-
-# seq ->(expand) hasSubseq
-lstm_expand = expand_layer(
-    input=lstm_last,
-    expand_as=emb_group,
-    expand_level=ExpandLevel.FROM_SEQUENCE)
-
-# hasSubseq ->(average) seq
-lstm_average = pooling_layer(
-    input=lstm_expand,
-    pooling_type=AvgPooling(),
-    agg_level=AggregateLevel.TO_SEQUENCE)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_average)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
deleted file mode 100644
index bc3b22c2a94..00000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
+++ /dev/null
@@ -1,74 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y):
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=x)
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" won't work, because recurrent_group only support the input 
-    # sequence type is same as return sequence type.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=SubsequenceInput(emb))
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
deleted file mode 100644
index 165ab229897..00000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ /dev/null
@@ -1,76 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(wid, x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y, wid):
-        z = embedding_layer(input=wid, size=word_dim)
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, z, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=[x, wid])
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out
-    # links should be from sequences now.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(data), SubsequenceInput(emb)])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 9a48b7f25c4..00000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_subseq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
-def outer_step(x1, x2):
-    index = [0]
-
-    def inner_step(ipt):
-        index[0] += 1
-        i = index[0]
-        outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim)
-
-        def inner_step_impl(y):
-            inner_mem = memory(
-                name="inner_rnn_state_" + y.name,
-                size=hidden_dim,
-                boot_layer=outer_mem)
-            out = fc_layer(
-                input=[y, inner_mem],
-                size=hidden_dim,
-                act=TanhActivation(),
-                bias_attr=True,
-                name='inner_rnn_state_' + y.name)
-            return out
-
-        encoder = recurrent_group(
-            step=inner_step_impl, name='inner_%d' % i, input=ipt)
-        last = last_seq(name="outer_rnn_state_%d" % i, input=encoder)
-        return encoder, last
-
-    encoder1, sentence_last_state1 = inner_step(ipt=x1)
-    encoder2, sentence_last_state2 = inner_step(ipt=x2)
-
-    encoder1_expand = expand_layer(
-        input=sentence_last_state1, expand_as=encoder2)
-
-    return [encoder1_expand, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
-    targetInlink=emb2)
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent.py b/paddle/legacy/gserver/tests/sequence_recurrent.py
deleted file mode 100644
index e2c6a7935c2..00000000000
--- a/paddle/legacy/gserver/tests/sequence_recurrent.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent_group.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent_group.py b/paddle/legacy/gserver/tests/sequence_recurrent_group.py
deleted file mode 100644
index b4638bd9075..00000000000
--- a/paddle/legacy/gserver/tests/sequence_recurrent_group.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    with mixed_layer(
-            name="rnn_state",
-            size=hidden_dim,
-            bias_attr=False,
-            act=SoftmaxActivation()) as out:
-        out += identity_projection(input=y)
-        out += full_matrix_projection(
-            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
-    return out
-
-
-recurrent = recurrent_group(name="rnn", step=step, input=emb)
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn.conf b/paddle/legacy/gserver/tests/sequence_rnn.conf
deleted file mode 100644
index 3133595c9ce..00000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=emb)
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
deleted file mode 100644
index 921cef04dda..00000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-subseq = embedding_layer(input=data1, size=word_dim)
-seq = embedding_layer(input=data2, size=word_dim)
-nonseq = embedding_layer(input=label, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_mixed_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(subseq, seq, nonseq):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner', input=[subseq, seq, nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[
-        subseq, expand_layer(
-            seq, expand_as=subseq,
-            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
-                nonseq,
-                expand_as=subseq,
-                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
-        StaticInput(encoding)
-    ])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
deleted file mode 100644
index c7bcaf6c4b2..00000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_matched_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(data1, data2, label):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        subseq = embedding_layer(input=data1, size=word_dim)
-        seq = embedding_layer(input=data2, size=word_dim)
-        nonseq = embedding_layer(input=label, size=word_dim)
-
-        print_layer(input=[data1, seq, label, inner_mem])
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner',
-        input=[subseq, StaticInput(seq), nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
deleted file mode 100644
index bf4be779a23..00000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
+++ /dev/null
@@ -1,58 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y, wid):
-    z = embedding_layer(input=wid, size=word_dim)
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, z, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=[emb, data])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 3612b49c227..00000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_seq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the RNN in
-# sequence_nest_rnn_multi_unequalength_inputs.conf
-
-
-def step(x1, x2):
-    def calrnn(y):
-        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
-        out = fc_layer(
-            input=[y, mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='rnn_state_' + y.name)
-        return out
-
-    encoder1 = calrnn(x1)
-    encoder2 = calrnn(x2)
-    return [encoder1, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="stepout", step=step, input=[emb1, emb2])
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
deleted file mode 100644
index f468d229a88..00000000000
--- a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-void testActivation(const string& act) {
-  LOG(INFO) << "test activation: " << act;
-  size_t size = 10;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type(act);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  act + "_activation",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(Activation, activation) {
-  auto types = ActivationFunction::getAllRegisteredTypes();
-  std::set<string> excluded{"sequence_softmax"};
-  for (auto type : types) {
-    if (excluded.count(type)) continue;
-    testActivation(type);
-  }
-}
-
-void testSequenceSoftmaxAct(bool hasSubseq) {
-  LOG(INFO) << "test activation: sequence softmax";
-
-  const size_t size = 1;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sequence_softmax");
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       1,
-       0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sequence_softmax",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(SequenceSoftmaxActivation, activation) {
-  for (auto hasSubseq : {false, true}) {
-    LOG(INFO) << "hasSubseq = " << hasSubseq;
-    testSequenceSoftmaxAct(hasSubseq);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_BatchNorm.cpp b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
deleted file mode 100644
index e21fa160744..00000000000
--- a/paddle/legacy/gserver/tests/test_BatchNorm.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/cuda/include/hl_batch_norm.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the batchNormLayer can be followed by a ConvLayer
-TEST(Layer, batchNorm) {
-  FLAGS_use_gpu = false;
-  TestConfig configBN;
-  const int CHANNELS = 6272;
-  const int IMG_SIZE = 1;
-  configBN.layerConfig.set_type("batch_norm");
-  configBN.layerConfig.set_name("bn");
-  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
-  configBN.layerConfig.set_active_type("relu");
-  configBN.biasSize = CHANNELS;
-  configBN.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
-                                /* paraSize= */ CHANNELS});
-
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 64;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(64);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
-  input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(5);
-  conv->set_filter_size_y(5);
-  conv->set_channels(128);
-  conv->set_padding(1);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(7);
-  conv->set_output_x(3);
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(configBN,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "batch_norm",
-                100,
-                false,
-                false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr bnLayer;
-  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
-
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters2, &convLayer);
-
-  bnLayer->forward(PASS_GC);
-  convLayer->forward(PASS_GC);
-
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void batchNormInference(int n, int c, int h, int w) {
-  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  input->randomizeUniform();
-  cudnnOut->zeroMem();
-  cudaOut->zeroMem();
-
-  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
-  scale->randomizeUniform();
-  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
-  bias->randomizeUniform();
-
-  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
-  movingMean->randomizeUniform();
-
-  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
-  movingVar->randomizeUniform();
-  movingVar->clip(0.01, 50);
-
-  hl_tensor_descriptor ioDesc;
-  hl_tensor_descriptor bnDesc;
-  hl_create_tensor_descriptor(&ioDesc);
-  hl_create_tensor_descriptor(&bnDesc);
-  hl_tensor_reshape(ioDesc, n, c, h, w);
-  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
-
-  double EPS = 1E-5;
-  hl_batch_norm_forward_inference(ioDesc,
-                                  input->getData(),
-                                  ioDesc,
-                                  cudnnOut->getData(),
-                                  bnDesc,
-                                  scale->getData(),
-                                  bias->getData(),
-                                  movingMean->getData(),
-                                  movingVar->getData(),
-                                  EPS);
-
-  hl_batch_norm_cuda_inference(input->getData(),
-                               cudaOut->getData(),
-                               scale->getData(),
-                               bias->getData(),
-                               movingMean->getData(),
-                               movingVar->getData(),
-                               EPS,
-                               n,
-                               c,
-                               h,
-                               w);
-
-  cudnnCheck->copyFrom(*cudnnOut);
-  cudaCheck->copyFrom(*cudaOut);
-  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
-
-  hl_destroy_tensor_descriptor(ioDesc);
-  hl_destroy_tensor_descriptor(bnDesc);
-}
-
-TEST(BatchNorm, Inference) {
-  batchNormInference(33, 267, 1, 1);
-  batchNormInference(19, 105, 4, 4);
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
deleted file mode 100644
index 1dafd1de4d8..00000000000
--- a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-// log(exp(x) + exp(y))
-static inline real logSum(real x, real y) {
-  real maxValue = std::max(x, y);
-  if (std::isinf(maxValue)) {
-    return -std::numeric_limits<real>::infinity();
-  } else {
-    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
-  }
-}
-
-static inline std::vector<int> genRandLabels(int numClasses, int length) {
-  std::vector<int> labels(length);
-  for (int i = 0; i < length; ++i) {
-    labels[i] = rand() % numClasses;  // NOLINT
-  }
-  return labels;
-}
-
-TEST(CRFLayer, cost) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-
-      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
-
-      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
-
-      real logZ = -std::numeric_limits<real>::infinity();
-      real logNominator = -std::numeric_limits<real>::infinity();
-      std::vector<int> testResult(length, 0);
-      do {
-        real score = a[testResult.front()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        score += b[testResult.back()];
-        logZ = logSum(logZ, score);
-
-        if (goldenLabels == testResult) {
-          logNominator = score;
-        }
-      } while (getNextSequence(testResult, numClasses));
-
-      real trueCost = -logNominator + logZ;
-
-      real diff = fabs(trueCost - cost);
-      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
-      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
-              << std::endl;
-      if (typeid(real) == typeid(double)) {  // NOLINT
-        EXPECT_LE(diff, 1e-10);
-      } else {
-        EXPECT_LE(diff, 5e-3);
-      }
-    }
-  }
-}
-
-inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
-
-TestConfig initTestConfig(size_t numClasses, bool withWeight) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(numClasses);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              numClasses,
-                              numClasses * (numClasses + 2)});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
-  config.layerConfig.add_inputs();
-
-  if (withWeight) {
-    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
-    config.layerConfig.add_inputs();
-  }
-
-  return config;
-}
-
-TEST(Layer, CRFLayer) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-TEST(Layer, CRFLayerUseWeight) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CompareSparse.cpp b/paddle/legacy/gserver/tests/test_CompareSparse.cpp
deleted file mode 100644
index 11b633a5885..00000000000
--- a/paddle/legacy/gserver/tests/test_CompareSparse.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 = "legacy/gserver/tests/sequence_lstm.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DEFINE_double(max_diff_ratio,
-              0.0f,
-              "max diff ratio allowed for parameters value");
-
-int gNumDevices = 0;
-
-std::vector<ParameterPtr> trainerOnePassTest(const string& configFile,
-                                             bool sparseUpdate,
-                                             int trainerCount = 1,
-                                             bool useGpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0";
-
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate;
-  srand(FLAGS_seed);
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  ThreadLocalRandomEngine::get().seed(FLAGS_seed);
-  if (useGpu) {
-    CHECK_LE(trainerCount, gNumDevices);
-  }
-
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-  if (!FLAGS_local) {
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    pservers.resize(numPorts);
-
-    for (int i = 0; i < numPorts; ++i) {
-      pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      pservers[i]->init();
-      pservers[i]->start();
-    }
-  }
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  trainer.train();
-  return trainer.getGradientMachine()->getParameters();
-}
-
-std::vector<ParameterPtr>& getDenseParameters() {
-  static std::vector<ParameterPtr> denseParameters;
-  if (denseParameters.empty()) {
-    // use dense training as base
-    FLAGS_local = true;
-    denseParameters = trainerOnePassTest(configFile1, false);
-  }
-
-  return denseParameters;
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 double maxDiffRatio) {
-  double maxDiff = 0;
-  double maxValue = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double diff = fabs(A[i] - B[i]);
-    maxValue = std::max<double>(maxValue, std::max(fabs(A[i]), fabs(B[i])));
-    maxDiff = std::max(maxDiff, diff);
-  }
-  EXPECT_LE(maxDiff / maxValue, maxDiffRatio);
-  LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue
-            << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n";
-}
-
-void compareValue(const vector<ParameterPtr>& parametersA,
-                  const vector<ParameterPtr>& parametersB,
-                  double maxDiffRatio = 0.0) {
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "para_A",
-                paraB.getData(),
-                "para_B",
-                paraA.getSize(),
-                maxDiffRatio);
-  }
-}
-
-TEST(compareSparse, cpu) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, remote_cpu) {
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, cpu10_local_vs_remote) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> localParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> remoteParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  compareValue(localParameters, remoteParameters);
-}
-
-TEST(compareSparse, multiGradientMachine) {
-  int numGpu;
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  numGpu = hl_get_device_count();
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = useGpu ? numGpu : 2;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, eps);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-TEST(compareSparse, NeuralNetwork) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = 1;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, useGpu ? eps : 0);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-int main(int argc, char** argv) {
-  // FIXME(tonyyang-svail):
-  //   Turn off this test due CI failure:
-  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
-  return 0;
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  gNumDevices = hl_get_device_count();
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameter
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
deleted file mode 100644
index e19c34abbd8..00000000000
--- a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_bool(need_high_accuracy,
-            false,
-            "whether need to run in double accuracy");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_int32(seed);
-
-static const string& config_file_a =
-    "legacy/gserver/tests/sequence_recurrent.py";
-static const string& config_file_b =
-    "legacy/gserver/tests/sequence_recurrent_group.py";
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  srand(FLAGS_seed);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-
-  trainer.getDataProvider()->reset();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-
-  trainer.getGradientMachine()->start();
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &data.outArgs, PASS_TRAIN);
-
-  trainer.getGradientMachine()->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  real maxVal = 0;
-  for (size_t i = 0; i < len; ++i) {
-    maxVal = std::max(maxVal, std::max(A[i], B[i]));
-  }
-  real maxDiff = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    maxDiff = std::max(maxDiff, diff);
-    if (diff > maxVal * FLAGS_max_diff_ratio) {
-      nNum++;
-      VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << "    "
-              << desB << " : " << B[i] << " diff=" << diff;
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, config_file_a);
-  LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, config_file_b);
-  LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  FLAGS_thread_local_rand_use_global_seed = true;
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-5;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-10;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_ConvTrans.cpp b/paddle/legacy/gserver/tests/test_ConvTrans.cpp
deleted file mode 100644
index 4ea0a3d379b..00000000000
--- a/paddle/legacy/gserver/tests/test_ConvTrans.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the convTrans forward is the same as conv backward
-TEST(Layer, convTransLayerFwd) {
-  // Setting up conv-trans layer
-  TestConfig configt;
-  configt.biasSize = 3;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(3);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->forward(PASS_GC);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384});
-  input = config.layerConfig.add_inputs();
-  conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers2;
-  LayerMap layerMap2;
-  vector<Argument> datas2;
-  initDataLayer(
-      config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap2, &parameters2, &convLayer);
-
-  // Sync convLayer and convtLayer parameter
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)));
-
-  // Set convLayer outputGrad as convTransLayer input value
-  convLayer->forward(PASS_GC);
-  convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue()));
-
-  vector<int> callbackFlags(parameters2.size(), 0);
-  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
-  convLayer->backward(callback);
-
-  // Check that the convLayer backward is the same as convTransLayer forward
-  checkMatrixEqual(convtLayer->getOutputValue(),
-                   dataLayers2[0]->getOutputGrad());
-}
-
-// Do one forward pass of convTrans layer and check to see if its output
-// matches the given result
-void doOneConvtTest(size_t imgSize,
-                    size_t output_x,
-                    size_t stride,
-                    size_t padding,
-                    size_t filter_size,
-                    MatrixPtr& result) {
-  TestConfig configt;
-  configt.biasSize = 1;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(1);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(1);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(1);
-  conv->set_filter_channels(1);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->add(1.0);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->getParameters()[0]->zeroMem();
-  convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0);
-  convtLayer->forward(PASS_GC);
-
-  checkMatrixEqual(convtLayer->getOutputValue(), result);
-}
-
-TEST(Layer, convTransLayerFwd2) {
-  MatrixPtr result;
-  result = Matrix::create(1, 5 * 5, false, false);
-  result->zeroMem();
-  result->add(1.0);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 1,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                       4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 4,
-                 result);
-
-  real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData2);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 1,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
-                        2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
-  result->setData(resultData3);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 0,
-                 /* filter_size */ 3,
-                 result);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_ConvUnify.cpp b/paddle/legacy/gserver/tests/test_ConvUnify.cpp
deleted file mode 100644
index d4ca158352d..00000000000
--- a/paddle/legacy/gserver/tests/test_ConvUnify.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Do one forward pass of ConvLayer using either exconv or cudnn_conv
-MatrixPtr doOneConvTest(size_t imgSize,
-                        size_t output_x,
-                        size_t stride,
-                        size_t padding,
-                        size_t filter_size,
-                        size_t channel,
-                        size_t numfilters,
-                        size_t groups,
-                        MatrixPtr& inputData,
-                        real* param,
-                        bool useGpu,
-                        bool isDeconv = false) {
-  TestConfig config;
-  config.biasSize = numfilters;
-  string layerType;
-  if (useGpu) {
-    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
-  } else {
-    layerType = (isDeconv) ? "exconvt" : "exconv";
-  }
-  config.layerConfig.set_type(layerType);
-  config.layerConfig.set_num_filters(numfilters);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  size_t weightSize = channel * filter_size * filter_size *
-                      config.layerConfig.num_filters() / groups;
-  if (isDeconv) {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
-    config.layerConfig.set_size(imgSize * imgSize *
-                                config.layerConfig.num_filters());
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
-    config.layerConfig.set_size(output_x * output_x *
-                                config.layerConfig.num_filters());
-  }
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(channel);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(groups);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  if (isDeconv) {
-    conv->set_filter_channels(numfilters / groups);
-  } else {
-    conv->set_filter_channels(channel / groups);
-  }
-
-  config.layerConfig.set_name("conv");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->copyFrom(*inputData);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters, &convLayer);
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(param, weightSize);
-  convLayer->forward(PASS_GC);
-
-  return convLayer->getOutputValue();
-}
-
-TEST(Layer, convParaUnified) {
-#ifdef PADDLE_WITH_CUDA
-  MatrixPtr input, resultCpu, resultGpu;
-
-  /// TEST1 for conv ///
-  input = Matrix::create(1, 4 * 4, false, false);
-  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
-  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST1 for deconv ///
-  input = Matrix::create(1, 2 * 2, false, false);
-  real inputDataT[] = {1, 2, 3, 4};
-  input->setData(inputDataT);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for conv ///
-  input = Matrix::create(1, 3 * 3 * 2, false, false);
-  real inputData2[] = {
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
-  real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData2);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for conv ///
-  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for deconv ///
-  input = Matrix::create(1, 2 * 2 * 2, false, false);
-  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
-  input->setData(inputData2T);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for deconv ///
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
deleted file mode 100644
index 34eb0dedffe..00000000000
--- a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-#include <sstream>
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const size_t MAX_SEQ_NUM = 23;
-const size_t MAX_SEQ_LEN = 50;
-const size_t MAX_BEAM_SIZE = 27;
-
-const size_t SEED = (size_t)(time(NULL));
-
-struct SingleBeamExpansion {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<real> candidateScores;
-
-  // TODO(caoying): store this into Argument.ids
-  vector<real> selectedIndices;
-
-  vector<int> groundTruth;
-  vector<size_t> inBeam;
-  vector<int> rowIdxInBeam;
-  vector<int> colIdxInBeam;
-
-  void resetGroundTruth(size_t n) {
-    groundTruth.clear();
-    groundTruth.resize(n, -1);
-
-    inBeam.clear();
-    inBeam.resize(n, 0);
-
-    rowIdxInBeam.clear();
-    rowIdxInBeam.resize(n, -1);
-
-    colIdxInBeam.clear();
-    colIdxInBeam.resize(n, -1);
-  }
-};
-
-inline float randFloat() {
-  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-}
-
-void genRand(real* numbers, size_t n) {
-  default_random_engine generator;
-  uniform_real_distribution<real> distribution(0.0, 1.0);
-  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genCandidateScores(bool hasSubseq,
-                        size_t beamSize,
-                        SingleBeamExpansion& prevBeam,
-                        SingleBeamExpansion& curBeam) {
-  vector<int>& seqStartPos = curBeam.seqStartPos;
-  seqStartPos.resize(1, 0);
-  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  if (prevBeam.selectedIndices.size()) {
-    if (prevBeam.subSeqStartPos.size() > 1) {
-      int seqIdx = 1;
-      // samples in previous beam are nested sequences.
-      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
-        for (size_t j = 0; j < beamSize; ++j) {
-          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
-          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
-                                   subSeqStartPos.back());
-        }
-        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          seqIdx++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
-        if (i && i % beamSize == 0) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          if (i == prevBeam.selectedIndices.size()) break;
-        }
-        if (prevBeam.selectedIndices[i] == -1.) continue;
-        subSeqStartPos.push_back(subSeqStartPos.back() +
-                                 (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  } else {
-    // the first beam expansion
-    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int i = 0; i < seqNum; ++i) {
-      if (hasSubseq) {
-        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
-          subSeqStartPos.push_back(subSeqStartPos.back() +
-                                   (1 + (rand() % MAX_SEQ_LEN)));
-        seqStartPos.push_back(subSeqStartPos.back());
-      } else {
-        seqStartPos.push_back(seqStartPos.back() +
-                              (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  }
-
-  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
-  curBeam.candidateScores.resize(totalSeqNum, 0.);
-  genRand(curBeam.candidateScores.data(), totalSeqNum);
-}
-
-void genSelectedIndices(size_t beamSize,
-                        vector<int>& seqStartPos,
-                        vector<real>& selectedIndices) {
-  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
-  selectedIndices.resize(selectedIdsCount, -1.);
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    int n = min(seqLen, static_cast<int>(beamSize));
-    vector<real> ids = randSampling(seqLen, n);
-    memcpy(selectedIndices.data() + i * beamSize,
-           ids.data(),
-           sizeof(real) * ids.size());
-  }
-}
-
-void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
-                    size_t beamSize) {
-  SingleBeamExpansion& beam = beamExpansions[1];
-  size_t seqNum = beam.seqStartPos.size() - 1;
-  for (size_t i = 2; i < beamExpansions.size(); ++i)
-    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
-
-  srand(SEED);
-
-  // initialize the first beam.
-  beam.resetGroundTruth(seqNum);
-  for (size_t i = 0; i < seqNum; ++i) {
-    if (randFloat() > 0.5) {
-      /*
-       * force the randomly generated label falls in the beam by chance 0.5.
-       * otherwise, when sequence length is relatively long and beam size is
-       * relatively small, the gold sequences falls off the beam at in the
-       * first search.
-       */
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      beam.colIdxInBeam[i] =
-          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
-            return val != -1.;
-          });
-      beam.groundTruth[i] =
-          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
-      beam.inBeam[i] = 1;
-    } else {
-      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
-      beam.groundTruth[i] = label;
-
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      real* endPos = begPos + beamSize;
-      real* lblPos = find(begPos, endPos, real(label));
-      if (lblPos != endPos) {
-        beam.inBeam[i] = 1;
-        beam.colIdxInBeam[i] = lblPos - begPos;
-      }
-    }
-    beam.rowIdxInBeam[i] = i;
-  }
-
-  // iterate over each beam expansions
-  for (size_t i = 2; i < beamExpansions.size(); ++i) {
-    SingleBeamExpansion& curBeam = beamExpansions[i];
-    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
-    curBeam.resetGroundTruth(seqNum);
-
-    // iterate over each sequence
-    for (size_t j = 0; j < seqNum; ++j) {
-      if (!prevBeam.inBeam[j]) continue;
-
-      // gold sequence falls in the beam in previous search.
-      real* begPos = prevBeam.selectedIndices.data();
-      int offset =
-          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
-      curBeam.rowIdxInBeam[j] = count_if(
-          begPos, begPos + offset, [](const real& val) { return val != -1.; });
-
-      if (randFloat() > 0.5) {
-        // force the randomly generated label falls in the beam by chance 0.5.
-
-        real* start =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
-                  return val != -1.;
-                });
-        curBeam.colIdxInBeam[j] = n;
-        curBeam.groundTruth[j] = *(start + n);
-        curBeam.inBeam[j] = 1;
-      } else {
-        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
-                 curBeam.subSeqStartPos.size() - 1);
-        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
-        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
-        CHECK_GT(size_t(end), size_t(start));
-        int label = rand() % (end - start);
-
-        curBeam.groundTruth[j] = label;
-        real* findBeg =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        real* lblPos =
-            find(findBeg, findBeg + beamSize, static_cast<real>(label));
-        if (lblPos != (findBeg + beamSize)) {
-          curBeam.inBeam[j] = 1;
-          curBeam.colIdxInBeam[j] = lblPos - findBeg;
-        }
-      }
-    }
-  }
-}
-
-void genOneBeam(size_t beamSize,
-                bool hasSubseq,
-                SingleBeamExpansion& prevBeam,
-                SingleBeamExpansion& curBeam) {
-  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
-  genSelectedIndices(beamSize,
-                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
-                     curBeam.selectedIndices);
-}
-
-void genRandomBeamExpansion(size_t expansionCount,
-                            size_t beamSize,
-                            vector<SingleBeamExpansion>& beamExpansions) {
-  beamExpansions.clear();
-  beamExpansions.resize(expansionCount + 1);
-
-  // beamExpansions[0] is reserved.
-  for (size_t i = 1; i <= expansionCount; ++i)
-    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
-  genGroundTruth(beamExpansions, beamSize);
-}
-
-void testCrossEntropyOverBeam(bool useGpu,
-                              size_t beamSize,
-                              vector<SingleBeamExpansion>& beams) {
-  TestConfig config;
-  config.layerConfig.set_type("cross_entropy_over_beam");
-
-  size_t seqNum = 0;
-  for (size_t i = 1; i < beams.size(); ++i) {
-    const SingleBeamExpansion& beam = beams[i];
-    // create scores for all the candidates
-    MatrixPtr candidateScorePtr =
-        Matrix::create(beam.candidateScores.size(), 1, false, false);
-    candidateScorePtr->copyFrom(beam.candidateScores.data(),
-                                beam.candidateScores.size());
-
-    ostringstream paramName;
-    paramName << "candidate_scores_" << i;
-
-    if (beam.subSeqStartPos.size() > 1) {
-      seqNum = beam.subSeqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos,
-                                  beam.subSeqStartPos});
-    } else {
-      seqNum = beam.seqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos});
-    }
-    config.layerConfig.add_inputs();
-
-    // create indices for the selected candidates
-    MatrixPtr selectedCandidates =
-        Matrix::create(seqNum, beamSize, false, false);
-    selectedCandidates->copyFrom(beam.selectedIndices.data(),
-                                 beam.selectedIndices.size());
-    paramName.clear();
-    paramName << "selected_candidates_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
-    config.layerConfig.add_inputs();
-
-    // create the ground truth
-    paramName.clear();
-    paramName << "label_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
-    config.layerConfig.add_inputs();
-  }
-
-  testLayerGrad(
-      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
-}
-
-TEST(Layer, CrossEntropyOverBeam) {
-  LOG(INFO) << "SEED = " << SEED;
-  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
-  LOG(INFO) << "beamSize = " << beamSize;
-
-  // TODO(caoying): test with random beam expansions.
-  const size_t expansionCount = 3;
-  vector<SingleBeamExpansion> beams;
-  genRandomBeamExpansion(expansionCount, beamSize, beams);
-
-  for (bool useGpu : {false, true})
-    testCrossEntropyOverBeam(useGpu, beamSize, beams);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(SEED);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_DetectionOutput.cpp b/paddle/legacy/gserver/tests/test_DetectionOutput.cpp
deleted file mode 100644
index 48652142655..00000000000
--- a/paddle/legacy/gserver/tests/test_DetectionOutput.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of priorBox layer and check to see if its output
-// matches the given result
-void doOneDetectionOutputTest(MatrixPtr& inputLoc,
-                              MatrixPtr& inputConf,
-                              MatrixPtr& inputPriorBox,
-                              size_t feature_map_width,
-                              size_t feature_map_height,
-                              real nms_threshold,
-                              bool use_gpu,
-                              MatrixPtr& result) {
-  // Setting up the detection output layer
-  TestConfig configt;
-  configt.layerConfig.set_type("detection_output");
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  configt.layerConfig.add_inputs();
-  configt.layerConfig.add_inputs();
-
-  DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
-  detOutput->set_width(feature_map_width);
-  detOutput->set_height(feature_map_height);
-  detOutput->set_nms_threshold(nms_threshold);
-  detOutput->set_num_classes(2);
-  detOutput->set_nms_top_k(20);
-  detOutput->set_keep_top_k(10);
-  detOutput->set_background_id(0);
-  detOutput->set_confidence_threshold(0.01);
-  detOutput->set_input_num(1);
-  configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
-  configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
-  configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
-  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
-  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
-
-  // test layer initialize
-  bool store_FLAGS_use_gpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = use_gpu;
-  std::vector<ParameterPtr> parameters;
-  LayerPtr detectionOutputLayer;
-  initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
-  FLAGS_use_gpu = store_FLAGS_use_gpu;
-  detectionOutputLayer->forward(PASS_GC);
-  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
-}
-
-TEST(Layer, detectionOutputLayerFwd) {
-  bool useGpu = false;
-  // CPU case 1.
-  MatrixPtr inputLoc;
-  MatrixPtr inputConf;
-  MatrixPtr inputPriorBox;
-  MatrixPtr result, result2, result3, result4;
-  real nmsTreshold = 0.01;
-  real inputLocData[] = {0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1};
-  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
-  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
-                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
-                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
-                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
-  real resultData[] = {
-      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
-  inputLoc = Matrix::create(1, 16, false, useGpu);
-  inputConf = Matrix::create(1, 8, false, useGpu);
-  inputPriorBox = Matrix::create(1, 32, false, useGpu);
-  result = Matrix::create(1, 7, false, useGpu);
-  inputLoc->setData(inputLocData);
-  inputConf->setData(inputConfData);
-  inputPriorBox->setData(inputPriorBoxData);
-  result->setData(resultData);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result);
-
-  // CPU case 2.
-  nmsTreshold = 0.2;
-  result2 = Matrix::create(2, 7, false, useGpu);
-  real resultData2[] = {0,
-                        1,
-                        0.68997443,
-                        0.099959746,
-                        0.099959746,
-                        0.50804031,
-                        0.50804031,
-                        0,
-                        1,
-                        0.59868765,
-                        0.29995975,
-                        0.29995975,
-                        0.70804024,
-                        0.70804024};
-  result2->setData(resultData2);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result2);
-
-#ifdef PADDLE_WITH_CUDA
-  // GPU case 1.
-  useGpu = true;
-  inputLoc = Matrix::create(1, 16, false, useGpu);
-  inputConf = Matrix::create(1, 8, false, useGpu);
-  inputPriorBox = Matrix::create(1, 32, false, useGpu);
-  inputLoc->copyFrom(inputLocData, 16);
-  inputConf->copyFrom(inputConfData, 8);
-  inputPriorBox->copyFrom(inputPriorBoxData, 32);
-
-  nmsTreshold = 0.01;
-  result3 = Matrix::create(1, 7, false, useGpu);
-  result3->copyFrom(resultData, 7);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result3);
-
-  // GPU case 2.
-  nmsTreshold = 0.2;
-  result4 = Matrix::create(2, 7, false, useGpu);
-  result4->copyFrom(resultData2, 14);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result4);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Evaluator.cpp b/paddle/legacy/gserver/tests/test_Evaluator.cpp
deleted file mode 100644
index 8aab50d23e5..00000000000
--- a/paddle/legacy/gserver/tests/test_Evaluator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-};
-
-struct TestConfig {
-  EvaluatorConfig evaluatorConfig;
-  std::vector<InputDef> inputDefs;
-  bool testAccumulate;
-  TestConfig() : testAccumulate(true) {}
-};
-
-void testEvaluator(TestConfig testConf,
-                   string testEvaluatorName,
-                   size_t batchSize,
-                   bool useGpu) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  testConf.evaluatorConfig.set_name(testEvaluatorName);
-  LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type()
-            << " useGpu=" << useGpu;
-
-  std::vector<Argument> arguments;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    Argument data;
-    size_t dim = testConf.inputDefs[i].dim;
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-        data.value = Matrix::create(batchSize, dim, false, useGpu);
-        data.value->randomizeUniform();
-
-        // make sure output > 0 && output < 1
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        data.ids = VectorT<int>::create(batchSize, useGpu);
-        data.ids->rand(dim);  // now rand number can be 0 to inputDefs[i].dim.
-        break;
-      case INPUT_SPARSE_NON_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(batchSize,
-                                            dim,
-                                            /* withValue= */ false,
-                                            useGpu);
-        break;
-      default:
-        LOG(FATAL) << " unknown inputType ";
-        return;
-    }
-
-    ICpuGpuVectorPtr sequenceStartPositions;
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) {
-      if (!sequenceStartPositions) {
-        generateSequenceStartPositions(batchSize, sequenceStartPositions);
-      }
-      data.sequenceStartPositions = sequenceStartPositions;
-    }
-
-    arguments.push_back(data);
-  }
-
-  Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig);
-  double totalScore = 0.0;
-  testEvaluator->start();
-  totalScore += testEvaluator->evalImp(arguments);
-  testEvaluator->updateSamplesNum(arguments);
-  testEvaluator->finish();
-  LOG(INFO) << *testEvaluator;
-
-  std::vector<std::string> names;
-  testEvaluator->getNames(&names);
-  paddle::Error err;
-  for (auto& name : names) {
-    auto value = testEvaluator->getValue(name, &err);
-    ASSERT_TRUE(err.isOK());
-    LOG(INFO) << name << " " << value;
-    auto tp = testEvaluator->getType(name, &err);
-    ASSERT_TRUE(err.isOK());
-    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
-  }
-
-  double totalScore2 = 0.0;
-  if (testConf.testAccumulate) {
-    testEvaluator->start();
-    totalScore2 += testEvaluator->evalImp(arguments);
-    testEvaluator->finish();
-    EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5);
-  }
-}
-
-void testEvaluatorAll(TestConfig testConf,
-                      string testEvaluatorName,
-                      size_t batchSize) {
-  testEvaluator(testConf, testEvaluatorName, batchSize, true);
-  testEvaluator(testConf, testEvaluatorName, batchSize, false);
-}
-
-TEST(Evaluator, detection_map) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("detection_map");
-  config.evaluatorConfig.set_overlap_threshold(0.5);
-  config.evaluatorConfig.set_background_id(0);
-  config.evaluatorConfig.set_ap_type("Integral");
-  config.evaluatorConfig.set_evaluate_difficult(0);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 7});
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
-  config.evaluatorConfig.set_evaluate_difficult(false);
-  testEvaluatorAll(config, "detection_map", 100);
-
-  config.evaluatorConfig.set_evaluate_difficult(true);
-  testEvaluatorAll(config, "detection_map", 100);
-}
-
-TEST(Evaluator, classification_error) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("classification_error");
-  config.evaluatorConfig.set_top_k(5);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 50});
-  testEvaluatorAll(config, "classification_error", 100);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "classification_error_weight", 100);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_DATA, "output", 100});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100});
-  // Not support GPU
-  testEvaluator(config, "classification_error_multi_binary_label", 50, false);
-
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "classification_error_weight_multi_binary_label", 50, false);
-}
-
-TEST(Evaluator, sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("sum");
-
-  // sum of output
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  testEvaluatorAll(config, "sum_output", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_output_weight", 200);
-
-  // sum of label
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "sum_label", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_label_weight", 200);
-}
-
-TEST(Evaluator, last_column_sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-sum");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  testEvaluatorAll(config, "last-column-sum", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-sum_weight", 200);
-}
-
-TEST(Evaluator, last_column_auc) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-auc");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 2});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 2});
-  testEvaluatorAll(config, "last-column-auc", 500);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-auc_weight", 200);
-}
-
-TEST(Evaluator, precision_recall) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("precision_recall");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "precision_recall", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  LOG(INFO) << "positive_label = 5";
-  config.evaluatorConfig.set_positive_label(5);
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.evaluatorConfig.set_positive_label(-1);
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10});
-  // Not support GPU
-  testEvaluator(config, "precision_recall_multi_binary_label", 100, false);
-
-  LOG(INFO) << "classification_threshold = 0.4";
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "precision_recall_weight_multi_binary_label", 100, false);
-}
-
-TEST(Evaluator, ctc_error_evaluator) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("ctc_edit_distance");
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1});
-  testEvaluatorAll(config, "ctc_error_evaluator", 100);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Expand.cpp b/paddle/legacy/gserver/tests/test_Expand.cpp
deleted file mode 100644
index fa1c86d13f4..00000000000
--- a/paddle/legacy/gserver/tests/test_Expand.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of expand layer and check to see if its output
-// matches the given result.(Test onlyCPU currently.)
-void doOneExpandTest(string trans_type,
-                     bool hasSubseq,
-                     bool useGpu,
-                     Argument& input1,
-                     Argument& input2,
-                     Argument& result) {
-  FLAGS_use_gpu = false;
-  // Setting up the expand layer
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  auto inputType1 =
-      trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA;
-  config.inputDefs.push_back({inputType1, "layer0", 1, 0});
-  auto inputType2 =
-      hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA;
-
-  config.inputDefs.push_back({inputType2, "layer1", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      config, &dataLayers, &datas, &layerMap, "expand", 1, false, useGpu);
-  dataLayers[0]->getOutput() = input1;
-  dataLayers[1]->getOutput() = input2;
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr expandLayer;
-  initTestLayer(config, &layerMap, &parameters, &expandLayer);
-  expandLayer->forward(PASS_GC);
-  checkMatrixEqual(expandLayer->getOutputValue(), result.value);
-}
-
-TEST(Layer, ExpandLayerFwd) {
-  bool useGpu = false;
-
-  // Assume batch_size =3 in all cases.
-
-  // CPU case 1. non-seq expand to seq
-  // input1 = 1,2,3
-  // input2 = [4,5],[6],[7,8,9]
-  // result = [1,1],[2],[3,3,3]
-  Argument input1, input2, result;
-  input1.value = Matrix::create(3, 1, false, useGpu);
-  real input1Data[] = {1, 2, 3};
-  input1.value->setData(input1Data);
-
-  input2.value = Matrix::create(6, 1, false, useGpu);
-  real input2Data[] = {4, 5, 6, 7, 8, 9};
-  input2.value->setData(input2Data);
-  input2.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
-  int input2Seq[] = {0, 2, 3, 6};
-  input2.sequenceStartPositions->copyFrom(input2Seq, 4, useGpu);
-
-  result.value = Matrix::create(6, 1, false, useGpu);
-  real resultData[] = {1, 1, 2, 3, 3, 3};
-  result.value->setData(resultData);
-
-  doOneExpandTest("non-seq", false, useGpu, input1, input2, result);
-
-  // CPU case 2. non-seq expand to sub-seq
-  // NOTE: input1.batch_size == input2.sequencelength in this case.
-  // i.e, input1 expands by input2.sequence
-  // input1 = 1,2,3
-  // input2 = [[4,5]],[[6]],[[7],[8,9]]
-  // result = [[1,1]],[[2]],[[3],[3,3]]
-  input2.subSequenceStartPositions = ICpuGpuVector::create(5, useGpu);
-  int input2SubSeq[] = {0, 2, 3, 4, 6};
-  input2.subSequenceStartPositions->copyFrom(input2SubSeq, 5, useGpu);
-
-  doOneExpandTest("non-seq", true, useGpu, input1, input2, result);
-
-  // CPU case 3. seq expand to sub-seq
-  // input1 = [1,2],[3],[4]
-  // input2 = [[4,5]],[[6]],[[7],[8,9]]
-  // result = [[1,1]],[[2]],[[3],[4,4]]
-  Matrix::resizeOrCreate(input1.value, 4, 1, false, useGpu);
-  real input1Data_case3[] = {1, 2, 3, 4};
-  input1.value->setData(input1Data_case3);
-
-  input1.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
-  int input1Seq[] = {0, 2, 3, 4};
-  input1.sequenceStartPositions->copyFrom(input1Seq, 4, useGpu);
-
-  real resultData_case3[] = {1, 1, 2, 3, 4, 4};
-  result.value->setData(resultData_case3);
-
-  doOneExpandTest("seq", true, useGpu, input1, input2, result);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
deleted file mode 100644
index e15b4e5038c..00000000000
--- a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-vector<int> randSampling(int range, int n) {
-  CHECK_GE(range, n);
-  vector<int> num(range);
-  iota(begin(num), end(num), 0);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  return num;
-}
-
-void genRandomSeqInfo(vector<int>& seqStartPosition,
-                      vector<int>& subSeqStartPosition) {
-  const int maxSeqNum = 100;
-  // generate random start position information
-  int seqNum = 1 + (rand() % maxSeqNum);
-  seqStartPosition.resize(seqNum + 1, 0);
-  subSeqStartPosition.resize(1, 0);
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqLen = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqLen; ++j)
-      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
-    seqStartPosition[i + 1] = subSeqStartPosition.back();
-  }
-}
-
-void genRandomGroundTruth(real* values,
-                          vector<vector<int>>& groundTruth,
-                          vector<int>& startPos,
-                          size_t beamSize) {
-  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
-  for (size_t i = 0; i < startPos.size() - 1; ++i) {
-    int seqLen = startPos[i + 1] - startPos[i];
-    vector<int> pos =
-        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
-    for (size_t j = 0; j < pos.size(); ++j) {
-      groundTruth[i][j] = pos[j];
-      values[startPos[i] + pos[j]] = 1.;
-    }
-  }
-}
-
-void checkLayerOut(vector<vector<int>> groundTruth,
-                   real* layerOut,
-                   size_t beamSize) {
-  for (size_t i = 0; i < groundTruth.size(); ++i) {
-    int begPos = i * beamSize;
-    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
-    sort(begin(tmp), end(tmp));
-    sort(begin(groundTruth[i]), end(groundTruth[i]));
-    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
-  }
-}
-
-TEST(Layer, kmaxSeqScoreLayer) {
-  const size_t maxBeamSize = 100;
-  size_t beamSize = 1 + (rand() % maxBeamSize);
-
-  vector<int> seqStartPosition;
-  vector<int> subSeqStartPosition;
-  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
-  MatrixPtr inValue =
-      Matrix::create(subSeqStartPosition.back(), 1, false, false);
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-
-  for (auto hasSubseq : {false, true}) {
-    vector<vector<int>> groundTruth;
-    inValue->randomizeUniform();
-    genRandomGroundTruth(inValue->getData(),
-                         groundTruth,
-                         hasSubseq ? subSeqStartPosition : seqStartPosition,
-                         beamSize);
-
-    for (auto useGpu : mode) {
-      TestConfig config;
-      config.layerConfig.set_type("kmax_seq_score");
-      config.layerConfig.set_beam_size(beamSize);
-
-      if (hasSubseq) {
-        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                    "scores",
-                                    inValue,
-                                    seqStartPosition,
-                                    subSeqStartPosition});
-      } else {
-        config.inputDefs.push_back(
-            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
-      }
-      config.layerConfig.add_inputs();
-
-      // data layer initialize
-      std::vector<DataLayerPtr> dataLayers;
-      LayerMap layerMap;
-      vector<Argument> datas;
-      initDataLayer(
-          config,
-          &dataLayers,
-          &datas,
-          &layerMap,
-          "kmax_seq_score",
-          100 /* actually this parameter is unused in self-defined input*/,
-          false,
-          useGpu);
-      // test layer initialize
-      std::vector<ParameterPtr> parameters;
-      LayerPtr kmaxSeqScoreLayer;
-      FLAGS_use_gpu = useGpu;
-      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
-      kmaxSeqScoreLayer->forward(PASS_TRAIN);
-
-      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
-      CHECK_EQ(outValue->getHeight(),
-               hasSubseq ? subSeqStartPosition.size() - 1
-                         : seqStartPosition.size() - 1);
-      CHECK_EQ(outValue->getWidth(), beamSize);
-      checkLayerOut(groundTruth, outValue->getData(), beamSize);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand((size_t)(time(NULL)));
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_LayerGrad.cpp b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
deleted file mode 100644
index 979cf8ee673..00000000000
--- a/paddle/legacy/gserver/tests/test_LayerGrad.cpp
+++ /dev/null
@@ -1,2532 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <cudnn.h>
-#endif
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-TEST(Operator, dot_mul) {
-  TestConfig config;
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("dot_mul");
-  operatorConf.set_dotmul_scale(-1);
-
-  testOperatorGrad(config, operatorConf, 100, false, false);
-}
-
-TEST(Projection, context) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20}) {
-        for (auto trainablePadding : {false, true}) {
-          LOG(INFO) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " batchSize=" << batchSize
-                    << " trainablePadding=" << trainablePadding;
-          ProjectionConfig conf;
-          conf.set_type("context");
-          conf.set_input_size(10);
-          conf.set_context_start(contextStart);
-          conf.set_context_length(contextLength);
-          conf.set_trainable_padding(trainablePadding);
-          conf.set_output_size(conf.context_length() * conf.input_size());
-          int pad =
-              std::max(0, -conf.context_start()) +
-              std::max(0, conf.context_start() + conf.context_length() - 1);
-          for (auto useGpu : {false, true}) {
-            testProjectionGrad(
-                conf,
-                INPUT_SEQUENCE_DATA,
-                trainablePadding ? conf.input_size() * pad : 0,
-                batchSize,
-                useGpu,
-                contextStart + contextLength <= 1);  // = testState
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Projection, trans_fc) {
-  ProjectionConfig conf;
-  conf.set_type("trans_fc");
-  conf.set_input_size(50);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1000,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, fc) {
-  ProjectionConfig conf;
-  conf.set_type("fc");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, dot_mul) {
-  ProjectionConfig conf;
-  conf.set_type("dot_mul");
-  conf.set_input_size(20);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 20,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, table) {
-  ProjectionConfig conf;
-  conf.set_type("table");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_LABEL,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, identity) {
-  ProjectionConfig conf;
-  conf.set_type("identity");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, slice) {
-  ProjectionConfig conf;
-  conf.set_type("slice");
-  conf.set_input_size(100);
-  SliceConfig& slice1 = *conf.add_slices();
-  slice1.set_start(10);
-  slice1.set_end(20);
-  SliceConfig& slice2 = *conf.add_slices();
-  slice2.set_start(50);
-  slice2.set_end(70);
-  conf.set_output_size(30);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 10,
-                       useGpu);
-  }
-}
-
-TEST(Projection, scaling) {
-  ProjectionConfig conf;
-  conf.set_type("scaling");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-void testProjectionConv(size_t groups, bool isDeconv) {
-  const int NUM_FILTERS = 18;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 2;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-
-#if CUDNN_VERSION >= 6000
-  const int DILATION = 2;
-#else
-  const int DILATION = 1;
-#endif
-
-  ProjectionConfig conf;
-  if (isDeconv) {
-    conf.set_type("convt");
-  } else {
-    conf.set_type("conv");
-  }
-  conf.set_num_filters(NUM_FILTERS);
-
-  ConvConfig* conv = conf.mutable_conv_conf();
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(DILATION);
-  conv->set_dilation_y(DILATION);
-  conv->set_groups(groups);
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-  }
-  conv->set_img_size(IMAGE_SIZE);
-  int output_x = outputSize(conv->img_size(),
-                            (conv->filter_size() - 1) * DILATION + 1,
-                            conv->padding(),
-                            conv->stride(),
-                            /* caffeMode */ true);
-  int output_y = outputSize(conv->img_size(),
-                            (conv->filter_size_y() - 1) * DILATION + 1,
-                            conv->padding_y(),
-                            conv->stride_y(),
-                            /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  conv->set_output_y(output_y);
-  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
-            << "; output_y: " << output_y;
-  if (isDeconv) {
-    int deconv_image_x = imageSize(output_x,
-                                   (conv->filter_size() - 1) * DILATION + 1,
-                                   conv->padding(),
-                                   conv->stride(),
-                                   /* caffeMode */ true);
-    int deconv_image_y = imageSize(output_y,
-                                   (conv->filter_size_y() - 1) * DILATION + 1,
-                                   conv->padding_y(),
-                                   conv->stride_y(),
-                                   /* caffeMode */ true);
-
-    LOG(INFO) << " deconv_image_x: " << deconv_image_x
-              << "; deconv_image_y: " << deconv_image_y;
-    conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
-  } else {
-    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-    conf.set_output_size(output_x * output_y * NUM_FILTERS);
-  }
-
-  testProjectionGrad(conf,
-                     INPUT_DATA,
-                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
-                         FILTER_SIZE_Y / groups,
-                     /* batchSize */ 100,
-                     true,
-                     false,
-                     NUM_FILTERS,
-                     true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Projection, conv) {
-  /// test ConvProjection
-  testProjectionConv(1, false);
-  testProjectionConv(3, false);
-  /// test ConvTransProjection
-  testProjectionConv(1, true);
-  testProjectionConv(3, true);
-}
-#endif
-
-TEST(Layer, BilinearInterpLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("bilinear_interp");
-  config.biasSize = 0;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  ImageConfig* image = bilinear->mutable_image_conf();
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-
-  for (auto useGpu : {false, true}) {
-    for (auto outSize : {32, 64}) {
-      bilinear->set_out_size_x(outSize);
-      bilinear->set_out_size_y(outSize);
-      testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, concat) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("concat");
-  config.layerConfig.set_size(15);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "concat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, AddtoLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "addto", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CTCLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("ctc");
-  config.layerConfig.set_norm_by_times(false);
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "ctc",
-                  100,
-                  /* trans */ false, /* useGpu */
-                  useGpu);
-  }
-}
-
-TEST(Layer, cosSimLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CosSimVecMatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos_vm");
-  config.layerConfig.set_size(5);  // output size
-  config.layerConfig.set_cos_scale(2.0);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos_vm", 100, false, useGpu);
-  }
-}
-
-void testDepthwiseConvLayer(const string& type, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 32;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(32);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(16);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(8);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
-}
-
-TEST(Layer, depthwiseConvLayer) {
-  //  'depthwise_conv' is a sepecial case of 'exconv' whose
-  //  groups size equals to the input channels size.
-  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
-#endif
-}
-
-void testConvLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  int dilation = 2;
-  if (type == "cudnn_conv") {
-#if CUDNN_VERSION >= 6000
-    dilation = 2;
-#else
-    dilation = 1;
-#endif
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(2);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(dilation);
-  conv->set_dilation_y(dilation);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                (conv->filter_size() - 1) * dilation + 1,
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                (conv->filter_size_y() - 1) * dilation + 1,
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "conv", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convLayer) {
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
-  testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testConvTransLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 3;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(3);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-
-  config.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "convTrans", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convTransLayer) {
-  for (auto useGpu : {false, true}) {
-    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
-  }
-#ifdef PADDLE_WITH_CUDA
-  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, blockExpandLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("blockexpand");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-  blockExpand->set_img_size_x(64);
-  blockExpand->set_img_size_y(32);
-  blockExpand->set_channels(3);
-  blockExpand->set_padding_x(0);
-  blockExpand->set_padding_y(0);
-  blockExpand->set_block_x(4);
-  blockExpand->set_block_y(32);
-  blockExpand->set_stride_x(2);
-  blockExpand->set_stride_y(2);
-  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-                                       blockExpand->block_x(),
-                                       blockExpand->padding_x(),
-                                       blockExpand->stride_x(),
-                                       /* caffeMode */ false));
-  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-                                       blockExpand->block_y(),
-                                       blockExpand->padding_y(),
-                                       blockExpand->stride_y(),
-                                       /* caffeMode */ false));
-  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
-                              blockExpand->channels());
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "blockexpand", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, maxoutLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("maxout");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MaxOutConfig* maxout = input->mutable_maxout_conf();
-  ImageConfig* image = maxout->mutable_image_conf();
-
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-  maxout->set_groups(2);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "maxout", 10, false, useGpu);
-  }
-}
-
-void testFcLayer(string format, size_t nnz) {
-  TestConfig config;
-  config.biasSize = 1024;
-  config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(1024);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_drop_rate(0.1);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
-  config.layerConfig.add_inputs();
-
-  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-            << config.inputDefs[0].sparse.format;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "fc",
-                  100,
-                  /* trans */ false,
-                  useGpu,
-                  /* weight */ true);
-  }
-}
-
-TEST(Layer, fcLayer) {
-  testFcLayer("", 1024 * 1024 * 2);
-  testFcLayer("csc", 1024 * 10);
-  testFcLayer("csr", 1024 * 10);
-}
-
-TEST(Layer, SelectiveFullyConnectedLayer) {
-  TestConfig config;
-  size_t nin = 16;
-  size_t nout = 256;
-  config.layerConfig.set_type("selective_fc");
-  config.layerConfig.set_size(nout);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_has_selected_colums(true);
-  config.layerConfig.set_selective_fc_pass_generation(false);
-  config.biasSize = nout;
-
-  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
-  config.layerConfig.add_inputs();
-
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ false,
-                false);
-#ifdef PADDLE_WITH_CUDA
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ true,
-                false);
-#endif
-}
-
-TEST(Layer, DataNormLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("data_norm");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-  config.inputDefs.back().isStatic = true;
-  config.layerConfig.add_inputs();
-
-  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-    config.layerConfig.set_data_norm_strategy(strategy);
-    // The parameters are static, so not support GPU now
-    testLayerGrad(config,
-                  "data_norm",
-                  200,
-                  /* trans */ false,
-                  /* useGpu */ false);
-  }
-}
-
-TEST(Layer, hsigmoidLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("hsigmoid");
-  config.layerConfig.set_num_classes(5);
-  config.layerConfig.set_size(1);
-  config.biasSize = config.layerConfig.num_classes() - 1;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "hsigmoid",
-                  100,
-                  /* trans */ false,
-                  /* useGpu */ useGpu);
-  }
-}
-
-TEST(Layer, multi_cross) {
-  TestConfig config;
-  config.layerConfig.set_type("multi-class-cross-entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(
-        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_sparse_mat) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(layer, multi_binary_label_id) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_cross_with_selfnorm) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "multi_class_cross_entropy_with_selfnorm",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross_soft) {
-  TestConfig config;
-  config.layerConfig.set_type("soft_binary_class_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "soft_binary_class_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, sparse_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, sparse_float_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, square_error_weighted) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, huber_regression_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_regression");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto delta : {1, 3, 5}) {
-      config.layerConfig.set_delta(delta);
-      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, huber_two_class) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_classification");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
-  }
-}
-
-void testExpandLayer(string trans_type, bool hasSubseq) {
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  config.inputDefs.push_back(
-      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_1",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "expand", 30, false, useGpu);
-  }
-}
-
-TEST(Layer, ExpandLayer) {
-  testExpandLayer("non-seq", false);  // non-seq expand to seq
-  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-  testExpandLayer("seq", true);       // seq expand to hasSubseq
-}
-
-void testDegradeLayer(bool hasSubseq,
-                      string layer_type,
-                      string trans_type,
-                      int stride) {
-  TestConfig config;
-  config.layerConfig.set_type(layer_type);
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_seq_pool_stride(stride);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, layer_type, 100, false, useGpu);
-    }
-  };
-
-  if (layer_type == "average") {
-    for (auto strategy : {"average", "sum", "squarerootn"}) {
-      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy
-                << " seq_pool_stride=" << stride;
-      config.layerConfig.set_average_strategy(strategy);
-      testDegradeLayerGrad(config, layer_type);
-    }
-  } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-              << " seq_pool_stride=" << stride;
-    testDegradeLayerGrad(config, layer_type);
-  }
-}
-
-TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(false,
-                   "max",
-                   "non-seq",
-                   5);  // seq max to a shorten seq, stride window = 5
-  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
-}
-
-TEST(Layer, SequenceLastInstanceLayer) {
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // seq seqlastins to non-seq
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   5);  // seq seqlastins to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "seq",
-                   -1);  // hasSubseq seqlastins to seq
-}
-
-TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
-  testDegradeLayer(false,
-                   "average",
-                   "non-seq",
-                   5);  // seq average to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "average",
-                   "non-seq",
-                   -1);                          // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
-}
-
-TEST(Layer, SequenceConcatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqconcat");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqconcat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SequenceReshapeLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqreshape");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqreshape", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvShiftLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("conv_shift");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config, "conv_shift", 100, false, false);
-}
-
-TEST(Layer, PowerLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("power");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "power", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvexCombinationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("convex_comb");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "convex_comb", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, InterpolationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("interpolation");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "interpolation", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, DotProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("dot_prod");
-  config.layerConfig.set_size(1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "dot_prod", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, OuterProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("out_prod");
-  config.layerConfig.set_size(100);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "out_prod", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SlopeInterceptLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("slope_intercept");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_slope(1.0);
-  config.layerConfig.set_intercept(0.1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ScalingLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("scaling");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scaling", 100, false, useGpu);
-  }
-}
-
-void testNormLayer(const string& normType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_active_type("relu");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type(normType);
-  norm->set_channels(16);
-  norm->set_size(5);
-  norm->set_scale(0.001);
-  norm->set_pow(0.75);
-  norm->set_blocked(0);
-  norm->set_img_size(14);
-  norm->set_img_size_y(7);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  if (norm->norm_type() == "cmrnorm" ||
-      norm->norm_type() == "cmrnorm-projection") {
-    norm->set_scale(norm->scale() / norm->size());
-  } else {
-    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-  }
-
-  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-                              norm->channels());
-  config.biasSize = 0;
-
-  testLayerGrad(config, "norm", 100, trans, useGpu);
-}
-
-TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                true);
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                false);
-}
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(16);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(16);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void testPoolLayer(const string& poolType,
-                   bool trans,
-                   bool useGpu,
-                   bool excludeMode = true) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(14);
-  pool->set_img_size_y(14);
-  pool->set_exclude_mode(excludeMode);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_size_y(4);
-  pool->set_stride_y(3);
-  pool->set_img_size(10);
-  pool->set_img_size_y(20);
-  setPoolConfig(&config, pool, poolType);
-  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-                         ((float)pool->stride_y()) +
-                     1.5);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-#endif
-
-TEST(Layer, PoolLayer) {
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ false,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
-
-#ifdef PADDLE_WITH_CUDA
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ true,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-incl-pad-pool",
-                 /* trans= */ false,
-                 /* useGpu= */ true);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void setPool3DConfig(TestConfig* config,
-                     PoolConfig* pool,
-                     const string& poolType) {
-  // filter size
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-  const int CHANNELS = 16;
-
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool3d");
-  (*config).layerConfig.set_num_filters(NUM_FILTERS);
-
-  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
-  int pw = 0, ph = 0, pd = 0;
-  int sw = 2, sh = 2, sd = 2;
-
-  pool->set_pool_type(poolType);
-  pool->set_pool_type("avg");
-  pool->set_channels(CHANNELS);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_size_z(kd);
-  pool->set_padding(0);
-  pool->set_padding_y(0);
-  pool->set_padding_z(0);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-  pool->set_stride_z(sd);
-  pool->set_start(0);
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-  pool->set_output_z(od);
-}
-
-void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  pool->set_img_size(IMAGE_SIZE);
-  pool->set_img_size_y(IMAGE_SIZE_Y);
-  pool->set_img_size_z(IMAGE_SIZE_Z);
-
-  setPool3DConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool3d", 100, trans, useGpu);
-}
-
-TEST(Layer, Pool3DLayer) {
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testSppLayer(const string& poolType,
-                  const int pyramidHeight,
-                  bool trans,
-                  bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("spp");
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  SppConfig* sppConfig = input->mutable_spp_conf();
-  sppConfig->set_pool_type(poolType);
-  sppConfig->set_pyramid_height(pyramidHeight);
-  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-  imageConfig->set_channels(16);
-  imageConfig->set_img_size(10);
-  imageConfig->set_img_size_y(20);
-  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * imageConfig->channels());
-  testLayerGrad(config, "spp", 100, trans, useGpu);
-}
-
-TEST(Layer, SpatialPyramidPoolLayer) {
-  for (auto useGpu : {false, true}) {
-    for (auto pyramidHeight : {1, 2, 3}) {
-      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-      testSppLayer("max-projection", pyramidHeight, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, rankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, sumCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("sum_cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "sum_cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, weightedRankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, TensorLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("tensor");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = config.layerConfig.size();
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "tensor", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.biasSize = 4;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 28;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
-    }
-  }
-  for (auto useGpu : {true}) {
-    config.testBatchState = true;
-    config.layerConfig.set_reversed(false);
-    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, MDLstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("mdlstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 4 * 9;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_directions(true);
-  config.layerConfig.add_directions(true);
-
-  for (auto useGpu : {false, true}) {
-    for (int i = 0; i < 2; i++) {
-      for (int j = 0; j < 2; j++) {
-        config.layerConfig.set_directions(0, bool(i));
-        config.layerConfig.set_directions(1, bool(j));
-        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-      }
-    }
-  }
-}
-
-TEST(Layer, ParameterReluLayer) {
-  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-    TestConfig config;
-    config.layerConfig.set_type("prelu");
-    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_size(inputSize);
-    config.layerConfig.set_partial_sum(inputSize /
-                                       channels);  // size of feature map
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, "prelu", 100, false, useGpu);
-    }
-  };
-
-  testParameterReluLayer(192, 1);
-  testParameterReluLayer(192, 3);
-  testParameterReluLayer(192, 192);
-}
-
-TEST(Layer, ResizeLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("resize");
-  config.layerConfig.set_size(64);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "resize", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RotateLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("rotate");
-  const int CHANNEL = 2;
-  const int HEIGHT = 8;
-  const int WIDTH = 4;
-  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
-  config.layerConfig.set_size(INPUT_SIZE);
-  config.layerConfig.set_height(HEIGHT);
-  config.layerConfig.set_width(WIDTH);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rotate", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, NCELayer) {
-  TestConfig config;
-  size_t numClasses = 4;
-  config.layerConfig.set_type("nce");
-  config.layerConfig.set_size(1);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_num_classes(numClasses);
-  config.biasSize = numClasses;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
-  config.inputDefs.push_back(
-      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto withWeight : {false, true}) {
-    if (withWeight) {
-      config.inputDefs.push_back(
-          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-      config.layerConfig.add_inputs();
-    }
-
-    for (auto isIdLabel : {false, true}) {
-      config.inputDefs[1] = {
-          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-          "label",
-          /* dim= */ numClasses,
-          /* paraSize= */ 0};
-
-      for (auto withDist : {false, true}) {
-        config.layerConfig.clear_neg_sampling_dist();
-        if (withDist) {
-          double sum = 0;
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = rand();  // NOLINT use rand_r
-            config.layerConfig.add_neg_sampling_dist(p);
-            sum += p;
-          }
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = config.layerConfig.neg_sampling_dist(i) / sum;
-            config.layerConfig.set_neg_sampling_dist(i, p);
-          }
-        }
-        LOG(INFO) << "NCELayer "
-                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
-                  << " withDist=" << withDist;
-        // Not support GPU now
-        testLayerGrad(config,
-                      "nce",
-                      100,
-                      /* trans= */ false,
-                      /* useGpu */ false);
-      }
-    }
-  }
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gated_recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, GruStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gru_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, LstmStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstm_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, BatchNormalizationLayer) {
-  testBatchNormLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNormLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNormLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  const int IMG_SIZE_Z = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-  img_conf->set_img_size_z(IMG_SIZE_Z);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, testBatchNorm3DLayer) {
-  testBatchNorm3DLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNorm3DLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testConvOperator(bool isDeconv) {
-  TestConfig config;
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 9;
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  if (isDeconv) {
-    operatorConf.set_type("convt");
-  } else {
-    operatorConf.set_type("conv");
-  }
-  ConvConfig* conv = operatorConf.mutable_conv_conf();
-  operatorConf.set_num_filters(NUM_FILTERS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-    config.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                conv->output_x() * conv->output_y() * CHANNELS,
-                                0});
-    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                                NUM_FILTERS);
-  }
-
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_1",
-       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-}
-
-TEST(Operator, conv) {
-  testConvOperator(/*isDeconv*/ true);
-  testConvOperator(/*isDeconv*/ false);
-}
-
-TEST(Layer, FeatureMapExpandLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("featmap_expand");
-  const int CHANNELS = 10;
-  const int INPUT_SIZE = 100;
-  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-  config.layerConfig.set_num_filters(CHANNELS);
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              /* dim= */ INPUT_SIZE,
-                              /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    for (auto asRowVec : {false, true}) {
-      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
-      testLayerGrad(config,
-                    "featmap_expand",
-                    /*batch_size*/ 100,
-                    /* trans= */ false,
-                    useGpu,
-                    /* useWeight */ true);
-    }
-  }
-}
-
-TEST(Layer, MultiplexLayer) {
-  TestConfig config;
-  const int LAYER_SIZE = 100;
-  config.layerConfig.set_type("multiplex");
-  config.layerConfig.set_size(LAYER_SIZE);
-
-  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, PadLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("pad");
-
-  int c = 4;
-  int h = 31;
-  int w = 36;
-  size_t size = c * h * w;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PadConfig* pad = input->mutable_pad_conf();
-  ImageConfig* image = pad->mutable_image_conf();
-
-  image->set_channels(c);
-  image->set_img_size(h);
-  image->set_img_size_y(w);
-  pad->add_pad_c(1);
-  pad->add_pad_c(2);
-  pad->add_pad_h(2);
-  pad->add_pad_h(3);
-  pad->add_pad_w(3);
-  pad->add_pad_w(5);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "pad", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, CrossChannelNormLayer) {
-  TestConfig config;
-  config.paramInitialMean = 1.;
-  config.paramInitialStd = 0.;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_size(100);
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cross-channel-norm");
-  norm->set_channels(10);
-  norm->set_size(100);
-  norm->set_scale(0);
-  norm->set_pow(0);
-  norm->set_blocked(0);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
-  }
-}
-
-TEST(Layer, smooth_l1) {
-  TestConfig config;
-  config.layerConfig.set_type("smooth_l1");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, multibox_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("multibox_loss");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
-  multiboxLoss->set_num_classes(21);
-  multiboxLoss->set_input_num(1);
-  multiboxLoss->set_overlap_threshold(0.5);
-  multiboxLoss->set_neg_pos_ratio(3);
-  multiboxLoss->set_neg_overlap(0.5);
-  multiboxLoss->set_background_id(0);
-  multiboxLoss->set_height(3);
-  multiboxLoss->set_width(3);
-
-  size_t gtNum = 1;
-  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
-  labelValue->randomizeUniform();
-  labelValue->add(-0.5);
-  labelValue->sigmoid(*labelValue);
-  real* labelData = labelValue->getData();
-  size_t labelWidth = labelValue->getWidth();
-  for (size_t i = 0; i < gtNum; ++i) {
-    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
-    *(labelData + i * labelWidth + 1) = 0.400259;
-    *(labelData + i * labelWidth + 2) = 0.377857;
-    *(labelData + i * labelWidth + 3) = 0.525712;
-    *(labelData + i * labelWidth + 4) = 0.519368;
-  }
-  vector<int> seqStartPositions(gtNum + 1, 0);
-  for (size_t i = 1; i <= gtNum; ++i) {
-    seqStartPositions[i] = i;
-  }
-
-  // Ensure at lease one matched bbox
-  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
-  priorValue->randomizeUniform();
-  priorValue->add(-0.5);
-  priorValue->sigmoid(*priorValue);
-  real* priorData = priorValue->getData();
-  *(priorData) = 0.424811;
-  *(priorData + 1) = 0.397059;
-  *(priorData + 2) = 0.538905;
-  *(priorData + 3) = 0.447091;
-  *(priorData + 4) = 0.425720;
-  *(priorData + 5) = 0.515228;
-  *(priorData + 6) = 0.519452;
-  *(priorData + 7) = 0.591065;
-
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
-  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
-  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
-  }
-}
-
-TEST(Layer, TransLayer) {
-  TestConfig config;
-  const int height = 128;
-  const int width = 256;
-  config.layerConfig.set_type("trans");
-  config.layerConfig.set_size(width);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, RowConvLayer) {
-  const int context = 3;
-  const int size = 512;
-
-  TestConfig config;
-  config.layerConfig.set_type("row_conv");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  RowConvConfig* conv = input->mutable_row_conv_conf();
-  conv->set_context_length(context);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, CropLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  config.layerConfig.set_axis(2);
-  config.layerConfig.add_offset(0);
-  config.layerConfig.add_offset(0);
-
-  // config input_1
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
-  input = config.layerConfig.add_inputs();
-  img = input->mutable_image_conf();
-  img->set_channels(2);
-  img->set_img_size(8);
-
-  // config crop layer
-  config.layerConfig.set_type("crop");
-  config.layerConfig.set_name("cropLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "crop", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, roi_pool) {
-  TestConfig config;
-  config.layerConfig.set_type("roi_pool");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
-  roiPoolConf->set_pooled_width(7);
-  roiPoolConf->set_pooled_height(7);
-  roiPoolConf->set_spatial_scale(1. / 16);
-  roiPoolConf->set_width(14);
-  roiPoolConf->set_height(14);
-
-  const size_t roiNum = 10;
-  const size_t roiDim = 10;
-  const size_t batchSize = 5;
-  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
-  roiValue->zeroMem();
-  real* roiData = roiValue->getData();
-  for (size_t i = 0; i < roiNum; ++i) {
-    roiData[i * roiDim + 0] = std::rand() % batchSize;
-    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
-    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
-    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
-    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
-    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
-    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, SwitchOrderLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  img->set_img_size_y(16);
-
-  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
-  reshape->add_height_axis(0);
-  reshape->add_height_axis(1);
-  reshape->add_height_axis(2);
-  reshape->add_width_axis(3);
-
-  // config softmax layer
-  config.layerConfig.set_type("switch_order");
-  config.layerConfig.set_name("switchOrderLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
-  }
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-TEST(Layer, SubNestedSequenceLayer) {
-  // layer size is not crutial for this layer,
-  // so use a small layer size in unittest
-  const int layerSize = 4;
-
-  const int maxSeqNum = 50;
-  const int maxSeqLen = 50;
-  const int maxBeamSize = 32;
-
-  srand((size_t)(time(NULL)));
-  int beamSize = 1 + (rand() % maxBeamSize);
-
-  TestConfig config;
-  config.layerConfig.set_type("sub_nested_seq");
-  config.layerConfig.set_name("sub_nested_seq_layer");
-  config.layerConfig.set_size(layerSize);
-
-  int seqNum = 1 + (rand() % maxSeqNum);
-
-  // sequence information for the first input, it is a nested sequence
-  vector<int> seqStartPos(seqNum + 1, 0);
-  vector<int> subSeqStartPos(1, 0);
-
-  // selected indices
-  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
-  selectedIndices->one();
-  selectedIndices->mulScalar(-1.);
-  real* indicesData = selectedIndices->getData();
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqNum; ++j) {
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % maxSeqLen)));
-    }
-    vector<real> selSeqs =
-        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
-    memcpy(indicesData + (i * beamSize),
-           selSeqs.data(),
-           selSeqs.size() * sizeof(real));
-    seqStartPos[i + 1] = subSeqStartPos.back();
-  }
-
-  MatrixPtr seqInputPtr =
-      Matrix::create(seqStartPos.back(), layerSize, false, false);
-  seqInputPtr->randomizeUniform();
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                              "nested_seq_input",
-                              seqInputPtr,
-                              seqStartPos,
-                              subSeqStartPos});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sub_nested_seq",
-                  /* batchSize */ seqNum,
-                  /* trans */ false,
-                  /* useGpu*/ useGpu,
-                  /* useWeight */ false);
-  }
-}
-
-TEST(Layer, ClipLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("clip");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ClipConfig* layerConf = input->mutable_clip_conf();
-  double p1 = std::rand() / (double)RAND_MAX;
-  double p2 = std::rand() / (double)RAND_MAX;
-  layerConf->set_min(std::min(p1, p2));
-  layerConf->set_max(std::max(p1, p2));
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, RowL2NormLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("row_l2_norm");
-  config.layerConfig.set_size(size);
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
-  }
-}
-
-void test3DConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  // Setting up conv3D-trans layer
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_img_size_z(IMAGE_SIZE_Z);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-  conv->set_output_z(outputSize(conv->img_size_z(),
-                                conv->filter_size_z(),
-                                conv->padding_z(),
-                                conv->stride_z(),
-                                /*  caffeMode */ true));
-
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              conv->output_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "conv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DConvLayer) {
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 4;
-  const int IMAGE_SIZE_Y = 6;
-  const int IMAGE_SIZE_Z = 6;
-
-  // Setting up conv-trans layer
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type("deconv3d");
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_output_x(IMAGE_SIZE);
-  conv->set_output_y(IMAGE_SIZE_Y);
-  conv->set_output_z(IMAGE_SIZE_Z);
-
-  conv->set_img_size(imageSize(conv->output_x(),
-                               conv->filter_size(),
-                               conv->padding(),
-                               conv->stride(),
-                               true));
-  conv->set_img_size_y(imageSize(conv->output_y(),
-                                 conv->filter_size_y(),
-                                 conv->padding_y(),
-                                 conv->stride_y(),
-                                 true));
-  conv->set_img_size_z(imageSize(conv->output_z(),
-                                 conv->filter_size_z(),
-                                 conv->padding_z(),
-                                 conv->stride_z(),
-                                 true));
-  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
-                              conv->img_size_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DDeConvLayer) {
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, ScaleShiftLayer) {
-  // FIXME: Disable ScaleShiftLayer because it is not stable.
-  // https://github.com/PaddlePaddle/Paddle/issues/7781
-  return;
-  //  const size_t batchSize = 16;
-  //  const size_t size = 32;
-  //  TestConfig config;
-  //  config.layerConfig.set_type("scale_shift");
-  //  config.layerConfig.set_size(size);
-  //  config.biasSize = 1;
-  //  config.inputDefs.push_back(
-  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-  //  config.layerConfig.add_inputs();
-  //  for (auto useGpu : {false, true}) {
-  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-  //  }
-}
-
-TEST(Layer, ScaleSubRegionLayer) {
-  const size_t batchSize = 64;
-  const size_t size = 4096;
-  TestConfig config;
-  config.layerConfig.set_type("scale_sub_region");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
-  auto* data = indicesV->getData();
-  for (size_t i = 0; i < batchSize; ++i) {
-    data[i * 2] = 2;
-    data[i * 2 + 1] = 4;
-    data[i * 2 + 2] = 16;
-    data[i * 2 + 3] = 32;
-    data[i * 2 + 4] = 16;
-    data[i * 2 + 5] = 32;
-  }
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ScaleSubRegionConfig* scaleSubRegionConf =
-      input->mutable_scale_sub_region_conf();
-  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
-  imgConf->set_img_size(32);
-  imgConf->set_img_size_y(32);
-  imgConf->set_channels(4);
-  scaleSubRegionConf->set_value(2.0);
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, L2DistanceLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("l2_distance");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  const size_t input_dim = 27;
-  const size_t batch_size = 11;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
-  }
-}
-
-void testFactorizationMachineLayer(InputType type, bool useGpu) {
-  const int FACTOR_SIZE = 10;
-  TestConfig config;
-  config.layerConfig.set_type("factorization_machine");
-  config.layerConfig.set_factor_size(FACTOR_SIZE);
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-  config.inputDefs.push_back({type, "layer_0", 128, 1280});
-  config.layerConfig.add_inputs();
-  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
-}
-
-TEST(Layer, FactorizationMachineLayer) {
-  for (auto useGpu : {false, true}) {
-    testFactorizationMachineLayer(INPUT_DATA, useGpu);
-  }
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
deleted file mode 100644
index 7082c1363a4..00000000000
--- a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline bool getNextSequence(vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-TEST(LinearChainCRF, decoding) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-      vector<int> decodingResult(length);
-      vector<int> bestResult(length);
-      vector<int> testResult(length, 0);
-      crf.decode(x.getData(), &decodingResult[0], length);
-      real bestScore = -std::numeric_limits<real>::max();
-      do {
-        real score = a[testResult.front()] + b[testResult.back()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        if (score > bestScore) {
-          bestScore = score;
-          bestResult = testResult;
-        }
-      } while (getNextSequence(testResult, numClasses));
-      for (int k = 0; k < length; ++k) {
-        EXPECT_EQ(decodingResult[k], bestResult[k]);
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/gserver/tests/test_MKLDNN.cpp b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
deleted file mode 100644
index c79ccd1956c..00000000000
--- a/paddle/legacy/gserver/tests/test_MKLDNN.cpp
+++ /dev/null
@@ -1,448 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <string>
-#include <vector>
-#include "MKLDNNTester.h"
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/activations/MKLDNNActivation.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(use_gpu);
-DECLARE_bool(use_mkldnn);
-
-#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)         \
-  MKLDNNTester tester;                                        \
-  for (auto bs : {DESC.bs, 1}) {                              \
-    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \
-  }
-
-#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
-  TestConfig ref = DNN_CONFIG;                            \
-  ref.layerConfig.set_type(REF_TYPE);                     \
-  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
-
-struct testFcDesc {
-  int bs;
-  int ic;
-  int ih, iw;  // oh == ow == 1
-  int oc;
-};
-
-static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_fc");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.oc);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
-  cfg.layerConfig.add_inputs();
-}
-
-void testFcLayer(const testFcDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNFcConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
-  }
-}
-
-TEST(MKLDNNLayer, FcLayer) {
-  /* bs, ic, ih, iw, oc */
-  testFcLayer({2, 2, 1, 1, 3});
-  testFcLayer({3, 7, 1, 1, 19});
-  testFcLayer({8, 16, 13, 13, 32});
-  testFcLayer({4, 12, 13, 13, 18});
-  testFcLayer({2, 64, 16, 16, 32});
-  testFcLayer({15, 3, 16, 16, 6});
-}
-
-struct testConvDesc {
-  int bs, gp;
-  int ic, ih, iw;
-  int oc, oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-  int dh, dw;
-};
-
-static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_conv");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_num_filters(pm.oc);
-  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
-  cfg.layerConfig.set_shared_biases(true);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_groups(pm.gp);
-  conv->set_img_size(pm.iw);
-  conv->set_img_size_y(pm.ih);
-  conv->set_output_x(pm.ow);
-  conv->set_output_y(pm.oh);
-  conv->set_filter_size(pm.fw);
-  conv->set_filter_size_y(pm.fh);
-  conv->set_channels(pm.ic);
-  conv->set_padding(pm.pw);
-  conv->set_padding_y(pm.ph);
-  conv->set_stride(pm.sw);
-  conv->set_stride_y(pm.sh);
-  conv->set_dilation(pm.dw);
-  conv->set_dilation_y(pm.dh);
-  conv->set_caffe_mode(true);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
-      << "it is indivisible";
-
-  int fh = (pm.fh - 1) * pm.dh + 1;
-  int fw = (pm.fw - 1) * pm.dw + 1;
-  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
-  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testConvLayer(const testConvDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNConvConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
-  }
-}
-
-TEST(MKLDNNLayer, ConvLayer) {
-  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
-  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
-  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
-  // with groups
-  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
-}
-
-struct testPoolDesc {
-  int bs, ic;  // input channel and output channel are the same
-  int ih, iw;
-  int oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-};
-
-static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_pool");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-  pool->set_pool_type("avg-projection");
-  pool->set_channels(pm.ic);
-  pool->set_img_size(pm.iw);
-  pool->set_img_size_y(pm.ih);
-  pool->set_output_x(pm.ow);
-  pool->set_output_y(pm.oh);
-  pool->set_size_x(pm.fw);
-  pool->set_size_y(pm.fh);
-  pool->set_padding(pm.pw);
-  pool->set_padding_y(pm.ph);
-  pool->set_stride(pm.sw);
-  pool->set_stride_y(pm.sh);
-
-  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
-  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testPoolLayer(const testPoolDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNPoolConfig(dnnConfig, pm);
-  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
-  PoolConfig* pool = input->mutable_pool_conf();
-  for (auto type : {"max-projection", "avg-projection"}) {
-    pool->set_pool_type(type);
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
-  }
-}
-
-TEST(MKLDNNLayer, PoolLayer) {
-  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
-  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
-  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
-  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
-}
-
-struct testBatchNormDesc {
-  int bs;
-  int ic;
-  int ih, iw;
-};
-
-static void getMKLDNNBatchNormConfig(TestConfig& cfg,
-                                     const testBatchNormDesc& pm) {
-  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
-  cfg.layerConfig.set_type("mkldnn_batch_norm");
-  cfg.biasSize = pm.ic;
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.ic)});
-  cfg.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.add_inputs();
-  cfg.layerConfig.add_inputs();
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(pm.ic);
-  img_conf->set_img_size_y(pm.ih);
-  img_conf->set_img_size(pm.iw);
-}
-
-void testBatchNormLayer(const testBatchNormDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNBatchNormConfig(dnnConfig, pm);
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("batch_norm");
-  // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1
-  VLOG(MKLDNN_TESTS) << "check train phase";
-  dnnConfig.layerConfig.set_use_global_stats(false);
-  refConfig.layerConfig.set_use_global_stats(false);
-  MKLDNNTester tester;
-  tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
-  // for PASS_TEST, check use_global_stats true and false, and batchsize 1
-  VLOG(MKLDNN_TESTS) << "check test phase";
-  for (auto useGS : {false, true}) {
-    dnnConfig.layerConfig.set_use_global_stats(useGS);
-    refConfig.layerConfig.set_use_global_stats(useGS);
-    MKLDNNTester tester;
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST);
-    }
-  }
-}
-
-TEST(MKLDNNLayer, BatchNormLayer) {
-  testBatchNormLayer({4, 10, 6, 6});
-  testBatchNormLayer({16, 32, 16, 16});
-  testBatchNormLayer({4, 16, 8, 10});
-}
-
-struct testLRNDesc {
-  int bs, ic, ih, iw;
-  float scale, pow;
-  int localSize;
-};
-
-void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_lrn");
-  cfg.layerConfig.set_active_type("relu");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_channels(pm.ic);
-  norm->set_size(pm.localSize);
-  norm->set_scale(pm.scale);
-  norm->set_pow(pm.pow);
-  norm->set_blocked(0);
-  norm->set_img_size(pm.iw);
-  norm->set_img_size_y(pm.ih);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  cfg.layerConfig.set_size(layerSize);
-  cfg.biasSize = 0;
-}
-
-void testLRNLayer(const testLRNDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNLRNConfig(dnnConfig, pm);
-  // mkldnn_lrn <==> norm with cmrnorm-projection type
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("norm");
-  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cmrnorm-projection");
-  norm->set_scale(norm->scale() / norm->size());
-  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
-}
-
-TEST(MKLDNNLayer, LRNLayer) {
-  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
-  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
-  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
-}
-
-struct testImageDesc {
-  int bs, ic, ih, iw;
-};
-
-static void getAddtoConfig(TestConfig& cfg,
-                           const testImageDesc& pm,
-                           const size_t nInputs = 1) {
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.layerConfig.set_size(layerSize);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < nInputs; ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(pm.ic);
-    img_conf->set_img_size_y(pm.ih);
-    img_conf->set_img_size(pm.iw);
-  }
-}
-
-void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
-  CHECK_GE(nInputs, 1UL);
-  TestConfig dnnConfig;
-  getAddtoConfig(dnnConfig, pm, nInputs);
-  dnnConfig.layerConfig.set_type("mkldnn_addto");
-  for (auto withBias : {false, true}) {
-    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
-  }
-}
-
-TEST(MKLDNNLayer, AddtoLayer) {
-  testAddtoLayer({16, 5, 14, 14}, 1);
-  testAddtoLayer({8, 10, 8, 8}, 2);
-  testAddtoLayer({4, 12, 1, 1}, 3);
-}
-
-static void getMKLDNNConcatConfig(TestConfig& cfg,
-                                  const std::vector<testImageDesc>& inputs) {
-  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
-  int oc = inputs[0].ic;
-  for (size_t i = 1; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i].bs, inputs[0].bs);
-    CHECK_EQ(inputs[i].ih, inputs[0].ih);
-    CHECK_EQ(inputs[i].iw, inputs[0].iw);
-    oc += inputs[i].ic;
-  }
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("mkldnn_concat");
-  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back(
-        {INPUT_DATA,
-         ss.str(),
-         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
-         0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(inputs[i].ic);
-    img_conf->set_img_size_y(inputs[i].ih);
-    img_conf->set_img_size(inputs[i].iw);
-  }
-}
-
-void testConcatLayer(const std::vector<testImageDesc>& inputs) {
-  TestConfig dnnConfig;
-  getMKLDNNConcatConfig(dnnConfig, inputs);
-  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
-}
-
-TEST(MKLDNNLayer, ConcatLayer) {
-  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
-  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
-}
-
-void testActivation(std::string actType, const testImageDesc& pm) {
-  // TODO(TJ): remove me when paddle support elu activation
-  if (actType == "mkldnn_elu") {
-    return;
-  }
-  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
-  TestConfig cfg;
-  getAddtoConfig(cfg, pm);
-  TestConfig ref = cfg;
-  cfg.layerConfig.set_active_type(compareTypes[0]);
-  ref.layerConfig.set_active_type(compareTypes[1]);
-  RUN_MKLDNN_TEST(cfg, ref, pm)
-}
-
-TEST(MKLDNNActivation, Activations) {
-  auto types = MKLDNNActivation::getAllRegisteredTypes();
-  for (auto type : types) {
-    /* bs, c, h, w*/
-    testActivation(type, {16, 64, 32, 32});
-    testActivation(type, {2, 8, 1, 1});
-  }
-}
-
-DECLARE_string(config_args);
-TEST(MKLDNNNet, net) {
-  std::vector<std::string> cases = {"simple", "branch"};
-  for (auto name : cases) {
-    std::string config = "./legacy/gserver/tests/mkldnn_" + name + "_net.conf";
-    for (auto channels : {2, 32}) {
-      std::ostringstream oss;
-      oss << "channels=" << channels;
-      FLAGS_config_args = oss.str();
-      MKLDNNTester::runNetTest(config);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = true;
-  initMain(argc, argv);
-  initPython(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
deleted file mode 100644
index 2bc261b4a87..00000000000
--- a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(1);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(1);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
-                                       const string& poolType,
-                                       bool use_gpu,
-                                       MatrixPtr& maskMat) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(5);
-  pool->set_img_size_y(5);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  config.layerConfig.set_name("MaxPoolWithMask");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-
-  initDataLayer(config,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "MaxPoolWithMask",
-                1,
-                false,
-                use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
-
-  FLAGS_use_gpu = use_gpu;
-  std::vector<ParameterPtr> parameters;
-  LayerPtr maxPoolingWithMaskOutputLayer;
-  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
-  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
-
-  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
-                   maskMat);
-}
-
-TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
-  bool useGpu = false;
-  MatrixPtr inputMat;
-  MatrixPtr maskMat;
-  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
-                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
-                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
-  real maskData[] = {12, 4, 22, 24};
-
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->setData(inputData);
-  maskMat->setData(maskData);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#ifdef PADDLE_WITH_CUDA
-  useGpu = true;
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->copyFrom(inputData, 25);
-  maskMat->copyFrom(maskData, 4);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#endif
-}
diff --git a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
deleted file mode 100644
index 25b1a1191d0..00000000000
--- a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include <gtest/gtest.h>
-#include <vector>
-
-#undef PADDLE_DISABLE_TIMER
-#include "paddle/legacy/utils/Stat.h"
-
-#include "paddle/legacy/gserver/layers/MultinomialSampler.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-class MultinomialSamplerTester : public MultinomialSampler {
- public:
-  MultinomialSamplerTester(real* prob, int size)
-      : MultinomialSampler(prob, size) {}
-
-  template <typename Rand1>
-  int testGen(Rand1 rand1) {
-    return gen1(rand1);
-  }
-};
-
-TEST(MultinomialSampler, gen) {
-  int numGrids = 1024 * 1024;
-  int size = 1024 * 4;
-  default_random_engine reng;
-
-  for (size_t iter = 0; iter < 256; ++iter) {
-    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
-    vector<real> prob;
-    int sum = 0;
-    for (int i = 0; i < size; ++i) {
-      prob.push_back(rand(reng));
-      sum += prob.back();
-    }
-
-    CHECK_LE(sum, numGrids);
-    prob.back() += numGrids - sum;
-
-    vector<int> counts(size);
-    MultinomialSamplerTester sampler(&prob[0], size);
-    counts.assign(size, 0);
-    {
-      double s = (double)size / (double)numGrids;
-      REGISTER_TIMER("MultinomialSampler");
-      for (double i = 0; i < numGrids; ++i) {
-        int ret = sampler.testGen([i, s]() { return s * i; });
-        if (ret < 0 || ret >= size) {
-          EXPECT_GE(ret, 0);
-          EXPECT_LT(ret, size);
-          break;
-        }
-        ++counts[ret];
-      }
-    }
-    for (int i = 0; i < size; ++i) {
-      if (prob[i] != counts[i]) {
-        EXPECT_EQ(prob[i], counts[i]);
-        LOG(INFO) << iter;
-        break;
-      }
-    }
-  }
-}
-
-void benchmarkRandom() {
-  int n = 1024 * 1024;
-
-  int sum;
-  double sum1;
-
-  sum = 0;
-  unsigned int seed = 1;
-  {
-    REGISTER_TIMER("crand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand_r(&seed) % 1000;
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  default_random_engine reng;
-  uniform_int_distribution<int> rand(1, 1000);
-  sum = 0;
-  {
-    REGISTER_TIMER("stdrand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand(reng);
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  sum = 0;
-  {
-    REGISTER_TIMER("default_random_engine");
-    for (int i = 0; i < n; ++i) {
-      sum += reng();
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  uniform_real_distribution<double> rand1(0, 1);
-  sum1 = 0;
-  {
-    REGISTER_TIMER("stdrand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += rand1(reng);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-
-  sum1 = 0;
-  {
-    real a = 1.0f / (real)RAND_MAX;
-    REGISTER_TIMER("crand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += a * rand_r(&seed);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  benchmarkRandom();
-  int ret = RUN_ALL_TESTS();
-  globalStat.printSegTimerStatus();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
deleted file mode 100644
index c9f9f3e61be..00000000000
--- a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DEFINE_bool(use_label, true, "input label or sequence label");
-DEFINE_bool(static_para, false, "static parameter");
-
-struct DataIn {
-  std::vector<Argument> inArgs;
-  std::vector<MatrixPtr> outGrads;
-  std::vector<VectorPtr> paraValues;
-};
-
-struct DataOut {
-  std::vector<MatrixPtr> outValues;
-  std::vector<VectorPtr> paraGrads;
-};
-
-void initArgument(DataIn& data,
-                  const std::string& configPath,
-                  bool useGpu = FLAGS_use_gpu) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    Argument arg;
-    arg.value = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.value->randomizeUniform();
-    arg.value->add(-0.5);
-    arg.value->sigmoid(*arg.value);
-    arg.grad->zeroMem();
-    if (FLAGS_use_label) {
-      arg.ids = VectorT<int>::create(batchSize, useGpu);
-      arg.ids->rand(layerSize);
-    }
-    generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-    data.inArgs.push_back(arg);
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    grad->randomizeUniform();
-    data.outGrads.push_back(grad);
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), useGpu);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-
-  std::vector<ParameterPtr> parameters;
-  vector<Argument> outArgs;
-
-  auto gradientMachine = trainer.getGradientMachine();
-  parameters = gradientMachine->getParameters();
-  if (FLAGS_static_para) {
-    for (size_t i = 0; i < parameters.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->one();
-    }
-  } else {
-    for (size_t i = 0; i < in.paraValues.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-    }
-  }
-  gradientMachine->start();
-  gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    // If the all the layers in the config have no parameters, also
-    // not set NeedGradient(), the outArgs[i] will be nullptr.
-    outArgs[i].grad->copyFrom(*in.outGrads[i]);
-  }
-  gradientMachine->backward();
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(),
-                                     outArgs[i].value->getWidth(),
-                                     false,
-                                     false);
-    value->copyFrom(*outArgs[i].value);
-    out.outValues.push_back(value);
-  }
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr grad = Vector::create(
-        parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false);
-    grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT));
-    out.paraGrads.push_back(grad);
-  }
-
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("forward");
-    gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  }
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("backward");
-    gradientMachine->backward();
-  }
-
-  gradientMachine->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-}
-
-void compareGradient(DataOut& outA, DataOut& outB) {
-  LOG(INFO) << "------------------------------"
-            << " Check Network Output "
-            << "------------------------------";
-  for (size_t i = 0; i < outA.outValues.size(); ++i) {
-    LOG(INFO) << "OUTPUT VALUE: " << i;
-    checkBuffer(outA.outValues[i]->getData(),
-                "network A output",
-                outB.outValues[i]->getData(),
-                "network B output",
-                outA.outValues[i]->getElementCnt(),
-                outA.outValues[i]->getWidth());
-  }
-
-  if (!FLAGS_static_para) {
-    LOG(INFO) << "------------------------------"
-              << " Check Parameters "
-              << "------------------------------";
-    for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
-      LOG(INFO) << "PARAMETER GRADIENT: " << i;
-      checkBuffer(outA.paraGrads[i]->getData(),
-                  "Network A",
-                  outB.paraGrads[i]->getData(),
-                  "Network B",
-                  outA.paraGrads[i]->getSize());
-    }
-  }
-}
-
-void compareNetwork(const std::string& config_file_a,
-                    const std::string& config_file_b) {
-  DataIn in;
-  initArgument(in, config_file_a);
-
-  DataOut dataA;
-  calcGradient(in, dataA, config_file_a);
-  LOG(INFO) << "forwardBackward of Network A is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  DataOut dataB;
-  calcGradient(in, dataB, config_file_b);
-  LOG(INFO) << "forwardBackward of the Network B is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-TEST(Compare, concat_dotmul) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_dotmul_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_dotmul_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_fullmatrix) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_fullmatrix_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_fullmatrix_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_table) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_table_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_table_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_slice) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_slice_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_slice_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Compare, img_pool) {
-  std::string config_file_a = "./legacy/gserver/tests/img_pool_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/img_pool_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-TEST(Compare, img_conv) {
-  std::string config_file_a = "./legacy/gserver/tests/img_conv_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/img_conv_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-// Test cudnn_conv and exconv give the same result
-TEST(Compare, img_conv2) {
-  std::string config_file_a = "./legacy/gserver/tests/img_conv_cudnn.py";
-  std::string config_file_b = "./legacy/gserver/tests/img_conv_exconv.py";
-  bool useGpu = FLAGS_use_gpu;
-  double eps = FLAGS_checkgrad_eps;
-  FLAGS_use_gpu = true;
-  // Sometimes, this unit test will fail with 1e-2
-  FLAGS_checkgrad_eps = 4e-2;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-  FLAGS_checkgrad_eps = eps;
-}
-#endif
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-TEST(Compare, network) {
-  if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
-    compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_PriorBox.cpp b/paddle/legacy/gserver/tests/test_PriorBox.cpp
deleted file mode 100644
index 10d512ec45f..00000000000
--- a/paddle/legacy/gserver/tests/test_PriorBox.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of priorBox layer and check to see if its output
-// matches the given result
-void doOnePriorBoxTest(size_t feature_map_width,
-                       size_t feature_map_height,
-                       size_t image_width,
-                       size_t image_height,
-                       vector<int> min_size,
-                       vector<int> max_size,
-                       vector<real> aspect_ratio,
-                       vector<real> variance,
-                       bool use_gpu,
-                       MatrixPtr& result) {
-  // Setting up the priorbox layer
-  TestConfig configt;
-  configt.layerConfig.set_type("priorbox");
-
-  configt.inputDefs.push_back({INPUT_DATA, "featureMap", 1, 0});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  configt.inputDefs.push_back({INPUT_DATA, "image", 1, 0});
-  configt.layerConfig.add_inputs();
-  PriorBoxConfig* pb = input->mutable_priorbox_conf();
-  for (size_t i = 0; i < min_size.size(); i++) pb->add_min_size(min_size[i]);
-  for (size_t i = 0; i < max_size.size(); i++) pb->add_max_size(max_size[i]);
-  for (size_t i = 0; i < variance.size(); i++) pb->add_variance(variance[i]);
-  for (size_t i = 0; i < aspect_ratio.size(); i++)
-    pb->add_aspect_ratio(aspect_ratio[i]);
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
-  dataLayers[0]->getOutput().setFrameHeight(feature_map_height);
-  dataLayers[0]->getOutput().setFrameWidth(feature_map_width);
-  dataLayers[1]->getOutput().setFrameHeight(image_height);
-  dataLayers[1]->getOutput().setFrameWidth(image_width);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr priorboxLayer;
-  initTestLayer(configt, &layerMap, &parameters, &priorboxLayer);
-  priorboxLayer->forward(PASS_GC);
-  checkMatrixEqual(priorboxLayer->getOutputValue(), result);
-}
-
-TEST(Layer, priorBoxLayerFwd) {
-  vector<int> minSize;
-  vector<int> maxSize;
-  vector<real> aspectRatio;
-  vector<real> variance;
-  bool useGpu = false;
-
-  minSize.push_back(276);
-  maxSize.push_back(330);
-  variance.push_back(0.1);
-  variance.push_back(0.1);
-  variance.push_back(0.2);
-  variance.push_back(0.2);
-
-  // CPU case 1.
-  MatrixPtr result;
-  real resultData[] = {0.04,
-                       0.04,
-                       0.96,
-                       0.96,
-                       0.1,
-                       0.1,
-                       0.2,
-                       0.2,
-                       0,
-                       0,
-                       1,
-                       1,
-                       0.1,
-                       0.1,
-                       0.2,
-                       0.2};
-  result = Matrix::create(1, 2 * 8, false, useGpu);
-  result->setData(resultData);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-  // CPU case 2.
-  variance[1] = 0.2;
-  variance[3] = 0.1;
-  maxSize.pop_back();
-  real resultData2[] = {0,     0,     0.595, 0.595, 0.1, 0.2, 0.2, 0.1,
-                        0.405, 0,     1,     0.595, 0.1, 0.2, 0.2, 0.1,
-                        0,     0.405, 0.595, 1,     0.1, 0.2, 0.2, 0.1,
-                        0.405, 0.405, 1,     1,     0.1, 0.2, 0.2, 0.1};
-  Matrix::resizeOrCreate(result, 1, 4 * 8, false, useGpu);
-  result->setData(resultData2);
-  doOnePriorBoxTest(/* feature_map_width */ 2,
-                    /* feature_map_height */ 2,
-                    /* image_width */ 400,
-                    /* image_height */ 400,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-  // CPU case 3.
-  aspectRatio.push_back(2);
-  real resultData3[] = {0.04,     0.04, 0.96, 0.96,       0.1,        0.2,
-                        0.2,      0.1,  0,    0.17473088, 1,          0.825269,
-                        0.1,      0.2,  0.2,  0.1,        0.17473088, 0,
-                        0.825269, 1,    0.1,  0.2,        0.2,        0.1};
-  Matrix::resizeOrCreate(result, 1, 3 * 8, false, useGpu);
-  result->setData(resultData3);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-
-#ifdef PADDLE_WITH_CUDA
-  // reset the input parameters
-  variance[1] = 0.1;
-  variance[3] = 0.2;
-  maxSize.push_back(330);
-  aspectRatio.pop_back();
-  MatrixPtr resultGpu;
-  useGpu = true;
-  // GPU case 1.
-  resultGpu = Matrix::create(1, 2 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData, 2 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-  // GPU case 2.
-  variance[1] = 0.2;
-  variance[3] = 0.1;
-  maxSize.pop_back();
-  Matrix::resizeOrCreate(resultGpu, 1, 4 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData2, 4 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 2,
-                    /* feature_map_height */ 2,
-                    /* image_width */ 400,
-                    /* image_height */ 400,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-  // GPU case 3.
-  aspectRatio.push_back(2);
-  Matrix::resizeOrCreate(resultGpu, 1, 3 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData3, 3 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
deleted file mode 100644
index 0209e6818a8..00000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/gserver/dataproviders/PyDataProvider.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;     // NOLINT
-using namespace paddle;  // NOLINT
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu);
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num);
-
-TEST(PyDataProvider, py_fill_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleDataProvider"));
-  config.clear_files();
-  std::string dataFile =
-      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 2UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 2);
-}
-
-TEST(PyDataProvider, py_fill_nest_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleNestDataProvider"));
-  config.clear_files();
-  std::string dataFile =
-      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-  EXPECT_EQ(config.IsInitialized(), true);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 4UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 4);
-  // Check subSequenceStartPositions
-  EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL);
-  EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0);
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2);
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4);
-    } else {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4);
-    }
-  }
-}
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu) {
-  // Dense
-  real* data;
-  if (useGpu) {
-    MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(),
-                                            argumentList[0].value->getWidth(),
-                                            0,
-                                            0);
-    cpuMatrixPtr->copyFrom(*argumentList[0].value);
-    data = cpuMatrixPtr->getData();
-  } else {
-    data = argumentList[0].value->getData();
-  }
-  for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) {
-    EXPECT_EQ(*(data + i), (float)(i % 3 + 1));
-  }
-  // Sparse without value
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    data = argumentList[0].value->getData();
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) {
-    size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i);
-    EXPECT_EQ(colNum, (size_t)2);
-    const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i);
-    for (size_t j = 0; j < colNum; ++j) {
-      EXPECT_EQ((size_t)buf[j], (size_t)(j + 1));
-    }
-  }
-  // Index
-  for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) {
-    EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL);
-  }
-}
-
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num) {
-  EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL);
-  EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1);
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2),
-                sample_num);
-    } else {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1),
-                sample_num);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
deleted file mode 100644
index de313ba82cf..00000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(train_list, "unittest.list", "file list for unittest");
-
-namespace paddle {
-namespace unittest {
-namespace pydp2 {
-extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
-extern void clearOnPoolFilledHook();
-
-}  // namespace pydp2
-}  // namespace unittest
-}  // namespace paddle
-
-const paddle::real epsilon = 1e-5;
-
-static inline int64_t readDataBatch(paddle::DataBatch *batch,
-                                    const std::string &funcName,
-                                    int64_t batchSize = 65535) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object(funcName);
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  return provider->getNextBatchInternal(batchSize, batch);
-}
-
-TEST(PyDataProvider2, dense_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      size_t ii = i + 100;
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_EQ(num, 0);
-  }
-}
-
-TEST(PyDataProvider2, index_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_index_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(10000, &batch);
-    CHECK_EQ(num, 200);
-    for (int i = 0; i < 200; ++i) {
-      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
-    }
-  }
-}
-
-TEST(PyDataProvider2, init_hook) {
-  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
-  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
-  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
-  paddle::PyObjectPtr locals(PyDict_New());
-  paddle::PyObjectPtr mdl(PyRun_String(
-      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
-      Py_file_input,
-      globals.get(),
-      locals.get()));
-  CHECK_PY(mdl) << "Error!";
-  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
-  CHECK_PY(dps) << "Error!";
-
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_init_hook");
-  config.set_load_data_args(PyString_AsString(dps.get()));
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(100000, &batch);
-  ASSERT_EQ(num, 200);
-  auto &mat = batch.getStreams()[0].value;
-  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < 20; ++j) {
-      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_no_value_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_sparse_non_value_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(10000, &batch);
-  CHECK_EQ(num, 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    for (int j = 0; j < 10; ++j) {
-      CHECK_EQ(cols[j], (i + 1) * (j + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_value_no_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    real *dat = csm->getRowValues(i);
-    for (int j = 0; j < 10; ++j) {
-      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
-      EXPECT_EQ(dat[j], real(j) / real(i + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, index_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
-    for (size_t j = 0; j < i + 1; ++j) {
-      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
-      ++tmp;
-    }
-  }
-  ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201);
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    tmp += i;
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp);
-  }
-  tmp += 200;
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp);
-}
-
-TEST(PyDataProvider2, index_sub_seq) {
-  paddle::DataBatch batch;
-  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      for (size_t k = 0; k < j + 1; ++k) {
-        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
-      }
-    }
-  }
-
-  CHECK_EQ(tmp, arg.ids->getSize());
-
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201);
-  ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0);
-  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
-  size_t idx = 1;
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      tmp += j + 1;
-      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
-                (size_t)tmp);
-      ++idx;
-    }
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
-  }
-}
-
-TEST(PyDataProvider2, min_pool_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size");
-  config.set_load_data_args("");
-  size_t totalData = 1 << 14;
-  constexpr size_t batchSize = 100;
-  constexpr size_t minPoolSize = 1000;
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-
-  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
-    if (totalData > batchSize) {
-      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
-    }
-  });
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      totalData -= realBatchSize;
-    } else {
-      break;
-    }
-  }
-  paddle::unittest::pydp2::clearOnPoolFilledHook();
-}
-
-TEST(PyDataProvider2, can_over_batch_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_can_over_batch_size");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
-    } else {
-      break;
-    }
-  }
-}
-
-TEST(PyDataProvider2, input_order) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_input_order");
-  config.set_load_data_args("");
-
-  paddle::ModelConfig modelConfig;
-  *modelConfig.add_input_layer_names() = "input1";
-  *modelConfig.add_input_layer_names() = "input2";
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, modelConfig, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (!realBatchSize) {
-      break;
-    }
-    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
-    for (int64_t i = 0; i < realBatchSize; ++i) {
-      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
-      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
-    }
-  }
-}
-
-TEST(PyDataProvider2, test_check) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_check");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
-    if (!realBatchSize) {
-      break;
-    } else {
-      auto &ivec = batch.getStream(0).ids;
-      for (size_t i = 0; i < ivec->getSize(); ++i) {
-        CHECK_LT(ivec->getData()[i], 10);
-      }
-    }
-  }
-}
-
-TEST(PyDataProvider2, multiThread) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  paddle::DataBatch batch;
-  provider->getNextBatch(100, &batch);
-  provider->reset();
-  provider.reset();
-}
-
-TEST(PyDataProvider2, minPoolSizeWithCache) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size_with_cache");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  paddle::DataBatch batch;
-
-  for (int i = 0; i < 10; ++i) {
-    provider->reset();
-    int64_t sum = 0;
-    while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
-      sum += actualNum;
-    }
-    ASSERT_EQ(1 << 20, sum);
-  }
-}
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-
-  std::ofstream fout(FLAGS_train_list);
-  CHECK(fout.is_open());
-  fout << "stub file name" << std::endl;  // in unittest, filename is not used.
-  fout.close();
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.py b/paddle/legacy/gserver/tests/test_PyDataProvider2.py
deleted file mode 100644
index 461d80b9e68..00000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider2.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import random
-
-from paddle.trainer.PyDataProvider2 import *
-
-
-@provider(slots=[dense_vector(200, seq_type=SequenceType.NO_SEQUENCE)])
-def test_dense_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [(float(j - 100) * float(i + 1)) / 200.0 for j in xrange(200)]
-
-
-@provider(input_types=[integer_value(200, seq_type=SequenceType.NO_SEQUENCE)])
-def test_index_no_seq(setting, filename):
-    for i in xrange(200):
-        yield i
-
-
-def test_init_hooker(setting, value, **kwargs):
-    setting.value = value
-
-
-@provider(
-    input_types=[dense_vector(
-        20, seq_type=SequenceType.NO_SEQUENCE)],
-    init_hook=test_init_hooker)
-def test_init_hook(setting, filename):
-    for i in xrange(200):
-        yield setting.value
-
-
-@provider(input_types=[
-    sparse_binary_vector(
-        30000, seq_type=SequenceType.NO_SEQUENCE)
-])
-def test_sparse_non_value_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [(i + 1) * (j + 1) for j in xrange(10)]
-
-
-@provider(input_types=[
-    sparse_float_vector(
-        30000, seq_type=SequenceType.NO_SEQUENCE)
-])
-def test_sparse_value_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
-
-
-@provider(input_types=[integer_value(200, seq_type=SequenceType.SEQUENCE)])
-def test_index_seq(setting, filename):
-    for i in xrange(200):
-        yield range(i + 1)
-
-
-@provider(input_types=[index_slot(200, seq_type=SequenceType.SUB_SEQUENCE)])
-def test_index_sub_seq(setting, filename):
-    def gen_sub_seq(l):
-        l += 1
-        for j in xrange(l):
-            yield range(j + 1)
-
-    for i in xrange(200):
-        yield list(gen_sub_seq(i))
-
-
-@provider(input_types=[index_slot(100)], min_pool_size=1000)
-def test_min_pool_size(setting, filename):
-    for _ in xrange(1 << 14):
-        yield random.randint(0, 100 - 1)
-
-
-@provider(
-    input_types=[index_slot(
-        100, seq_type=SequenceType.SEQUENCE)],
-    can_over_batch_size=False,
-    calc_batch_size=lambda x: len(x[0]))
-def test_can_over_batch_size(setting, filename):
-    for _ in xrange(1 << 10):
-        seq_len = random.randint(0, 99)
-        yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
-
-
-@provider(input_types={'input1': index_slot(10), 'input2': index_slot(10)})
-def test_input_order(setting, filename):
-    for _ in xrange(1000):
-        yield {'input1': 0, 'input2': 1}
-
-
-@provider(
-    input_types=[index_slot(10)],
-    check=True,
-    check_fail_continue=True,
-    should_shuffle="123")  # also test should shuffle
-def test_check(settings, filename):
-    yield_good_value = False
-
-    while not yield_good_value:
-        for _ in xrange(10000):
-            i = random.randint(0, 100)
-            if i < 10:
-                yield_good_value = True
-            yield i
-
-
-@provider(
-    input_types=[index_slot(10)],
-    min_pool_size=1000,
-    cache=CacheType.CACHE_PASS_IN_MEM, )
-def test_min_pool_size_with_cache(settings, filename):
-    import random
-    for _ in xrange(2**20):
-        yield random.randint(0, 9)
diff --git a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
deleted file mode 100644
index 153c3e7f36a..00000000000
--- a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
-#include <paddle/legacy/trainer/Trainer.h>
-#include <paddle/legacy/trainer/TrainerInternal.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Util.h>
-#include <paddle/legacy/utils/Version.h>
-
-DECLARE_int32(seed);
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-class TrainerForTest : public paddle::Trainer {
- public:
-  void startTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.start();
-  }
-
-  void finishTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.finish();
-  }
-
-  /**
-   * Get total dimension of all parameters.
-   *
-   * @return the total dimension of all parameters
-   */
-  size_t getTotalParameterSize() const {
-    auto p = const_cast<TrainerForTest*>(this);
-    auto& params = p->getGradientMachine()->getParameters();
-    return std::accumulate(
-        params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) {
-          return a + p->getSize();
-        });
-  }
-};
-
-void CalCost(const string& conf,
-             const string& dir,
-             real* cost,
-             int num_passes) {
-  auto config = std::make_shared<TrainerConfigHelper>(conf);
-  TrainerForTest trainer;
-  trainer.init(config);
-  mkDir(dir.c_str());
-  config->setSaveDir(dir);
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = config->getOptConfig().batch_size();
-  real learningRate = config->getOptConfig().learning_rate();
-  real momentum = 0;
-  real decayRate = 0;
-  int64_t dim = trainer.getTotalParameterSize();
-  CpuVector vecW(dim);
-  CpuVector vecGradient(dim);
-  CpuVector vecMomentum(dim);
-
-  // vecW needs to be assigned, otherwise the variable is an uncertain value.
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  vecW.randnorm(0, 0.1);
-  vecMomentum.randnorm(0, 0.1);
-
-  trainer.startTrain();
-  for (int i = 0; i < num_passes; ++i) {
-    real totalCost = 0;
-    dataProvider->reset();
-    while (true) {
-      DataBatch dataBatch;
-      int num = dataProvider->getNextBatch(batchSize, &dataBatch);
-      if (num == 0) break;
-      totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient);
-      sgdUpdate(
-          learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum);
-    }
-    cost[i] = totalCost;
-  }
-  trainer.finishTrain();
-  rmDir(dir.c_str());
-}
-
-void test(const string& conf1, const string& conf2, double eps, bool useGpu) {
-  if (!paddle::version::isWithGpu() && useGpu) {
-    return;
-  }
-  FLAGS_use_gpu = useGpu;
-  int num_passes = 5;
-  real* cost1 = new real[num_passes];
-  const string dir1 = "legacy/gserver/tests/t1";
-  CalCost(conf1, dir1, cost1, num_passes);
-
-  real* cost2 = new real[num_passes];
-  const string dir2 = "legacy/gserver/tests/t2";
-  CalCost(conf2, dir2, cost2, num_passes);
-
-  for (int i = 0; i < num_passes; i++) {
-    LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i]
-              << ", cost2=" << cost2[i]
-              << ", diff=" << std::abs(cost1[i] - cost2[i]);
-    ASSERT_NEAR(cost1[i], cost2[i], eps);
-  }
-  delete[] cost1;
-  delete[] cost2;
-}
-
-TEST(RecurrentGradientMachine, HasSubSequence) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_layer_group.conf",
-         "legacy/gserver/tests/sequence_nest_layer_group.conf",
-         1e-5,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn.conf",
-         "legacy/gserver/tests/sequence_nest_rnn.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_multi_input.conf",
-         "legacy/gserver/tests/sequence_nest_rnn_multi_input.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
-         "legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_mixed_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_mixed_inputs.py",
-         "legacy/gserver/tests/sequence_rnn_matched_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-
-  if (paddle::version::isWithPyDataProvider()) {
-    if (!paddle::version::isWithGpu()) {
-      FLAGS_use_gpu = false;
-    }
-    initMain(argc, argv);
-    initPython(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
-  }
-}
diff --git a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
deleted file mode 100644
index 71198cb6a1d..00000000000
--- a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
+++ /dev/null
@@ -1,571 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Version.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-DECLARE_bool(use_gpu);
-DECLARE_bool(rnn_use_batch);
-DECLARE_int32(fixed_seq_length);
-
-void checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkError(const CpuVector& vector1, const CpuVector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int size = vector1.getSize();
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  int count = 0;
-  for (int i = 0; i < size; i++) {
-    if (fabs(data1[i] - data2[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        int layerSize,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.value->sigmoid(*data.value);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-ParameterPtr creatParameter(string name,
-                            int pid,
-                            size_t paraSize,
-                            bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->enableType(PARAMETER_GRADIENT);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-ParameterPtr creatParameterBias(string name,
-                                int pid,
-                                size_t paraSize,
-                                bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-  paraConfig.set_initial_std(1);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ true);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-LayerPtr initRecurrentLayer(LayerConfig layerConfig,
-                            size_t batchSize,
-                            int layerSize,
-                            bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu);
-  layerMap[dataLayer->getName()] = dataLayer;
-
-  ParameterPtr para =
-      creatParameter("para_0", 0, layerSize * layerSize, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkRecurrentLayer(LayerPtr testLayer) {
-  const VectorPtr& weightGrad =
-      (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad();
-  CpuVector seqPara(weightGrad->getSize());
-  CpuVector batPara(weightGrad->getSize());
-  CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-
-  CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  outputGrad.randomizeUniform();
-
-  /* use sequence calculate */
-  FLAGS_rnn_use_batch = false;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  seqPara.copyFrom(*weightGrad);
-  seqInputGrad.copyFrom(*inputGrad);
-
-  /* use batch calculate */
-  FLAGS_rnn_use_batch = true;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  batPara.copyFrom(*weightGrad);
-  batInputGrad.copyFrom(*inputGrad);
-
-  /* check */
-  checkError(seqInputGrad, batInputGrad);
-  checkError(seqPara, batPara);
-}
-
-TEST(Layer, RecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_name("rnn");
-  layerConfig.set_type("recurrent");
-  layerConfig.set_active_type("tanh");
-  for (auto layerSize : {1, 10, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 20, 100, 128}) {
-      for (auto useGpu : {false, true}) {
-        for (auto reversed : {false, true}) {
-          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " useGpu=" << useGpu << " reversed=" << reversed;
-          layerConfig.set_size(layerSize);
-          layerConfig.set_reversed(reversed);
-          LayerPtr testLayer =
-              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
-          checkRecurrentLayer(testLayer);
-        }
-      }
-    }
-  }
-}
-
-#define protected public
-#include "paddle/legacy/gserver/layers/GatedRecurrentLayer.h"
-#include "paddle/legacy/gserver/layers/LstmLayer.h"
-#include "paddle/legacy/gserver/layers/RecurrentLayer.h"
-template <class T>
-class TestRecurrentLayer {
- public:
-  LayerConfig config_;
-  bool useGpu_;
-  bool useBatch_;
-  LayerPtr testLayer_;
-  LayerPtr dataLayer_;
-  ParameterPtr para_;
-  ParameterPtr bias_;
-  LayerMap layerMap_;
-  ParameterMap parameterMap_;
-  TestRecurrentLayer(const LayerConfig& config,
-                     bool useGpu,
-                     bool useBatch = false)
-      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
-  void init(size_t batchSize) {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_ = Layer::create(config_);
-    if (typeid(T) == typeid(GatedRecurrentLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 3,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 3,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
-    } else if (typeid(T) == typeid(LstmLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 4,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 4,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
-    }
-    layerMap_[dataLayer_->getName()] = dataLayer_;
-    parameterMap_[para_->getName()] = para_;
-    parameterMap_[bias_->getName()] = bias_;
-
-    layerMap_[testLayer_->getName()] = testLayer_;
-    testLayer_->init(layerMap_, parameterMap_);
-    testLayer_->setNeedGradient(true);
-    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
-  }
-  void forward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->forward(PASS_GC);
-  }
-  void backward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->backward(nullptr);
-  }
-};
-
-template <class T>
-void checkRecurrentLayer(LayerConfig layerConfig,
-                         size_t batchSize,
-                         bool cpuBatch,
-                         bool gpuBatch) {
-  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
-  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
-  testCpu.init(batchSize);
-  testGpu.init(batchSize);
-  auto checkError = [](
-      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
-    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
-    check.copyFrom(*gpu);
-    int height = cpu->getHeight();
-    int width = cpu->getWidth();
-    const real* data1 = cpu->getData();
-    const real* data2 = check.getData();
-    int count = 0;
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
-            1e-4) {
-          count++;
-        }
-      }
-    }
-    EXPECT_EQ(count, 0) << "[" << str << "]"
-                        << "There are " << count << " different element.";
-  };
-  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
-  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
-
-  Argument& cpuInput = testCpu.dataLayer_->getOutput();
-  Argument& gpuInput = testGpu.dataLayer_->getOutput();
-  gpuInput.resizeAndCopyFrom(cpuInput, true);
-
-  const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
-  gpuVec->copyFrom(*cpuVec);
-
-  const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE);
-  gpuBiasVec->copyFrom(*cpuBiasVec);
-
-  /* check forward */
-  testCpu.forward();
-  testGpu.forward();
-
-  checkError(
-      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
-
-  /* check backward */
-  cpuLayer->getOutputGrad()->randomizeUniform();
-  gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad());
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  testCpu.backward();
-  testGpu.backward();
-
-  // check input grad
-  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
-  // check weight grad
-  int numSequences = cpuInput.getNumSequences();
-  checkError(cpuLayer->weight_->getWGrad(),
-             gpuLayer->weight_->getWGrad(),
-             numSequences,
-             "weightGrad");
-  // check bias grad
-  checkError(cpuLayer->bias_->getWGrad(),
-             gpuLayer->bias_->getWGrad(),
-             numSequences,
-             "biasGrad");
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("gated_recurrent");
-  layerConfig.set_active_type("sigmoid");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<GatedRecurrentLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("lstmemory");
-  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("tanh");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<LstmLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef PADDLE_WITH_MKLML
-
-#include "paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h"
-
-LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
-                            bool reversed,
-                            int layerSize,
-                            LayerPtr dataLayer,
-                            ParameterPtr para,
-                            ParameterPtr bias = nullptr) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  parameterMap[para->getName()] = para;
-  if (bias) {
-    parameterMap[bias->getName()] = bias;
-    layerConfig.set_bias_parameter_name("bias_0");
-  }
-
-  layerConfig.set_size(layerSize);
-  layerConfig.set_reversed(reversed);
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkMKLPackedLayer(LayerConfig layerConfig1,
-                         LayerConfig layerConfig2,
-                         bool reversed,
-                         int layerSize,
-                         int batchSize,
-                         bool useBatch1,
-                         bool useBatch2) {
-  LayerPtr dataLayer;
-  ParameterPtr para, bias;
-
-  if (layerConfig1.type() == "recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize, false);
-    bias = nullptr;
-  } else if (layerConfig1.type() == "gated_recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
-    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
-  }
-
-  LayerPtr testLayer1 = initMKLPackedLayer(
-      layerConfig1, reversed, layerSize, dataLayer, para, bias);
-  LayerPtr testLayer2 = initMKLPackedLayer(
-      layerConfig2, reversed, layerSize, dataLayer, para, bias);
-
-  const VectorPtr& weightGrad =
-      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
-  CpuVector wgt_grad1(weightGrad->getSize());
-  CpuVector wgt_grad2(weightGrad->getSize());
-  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
-
-  for (int i = 0; i < 2; i++) {
-    FLAGS_rnn_use_batch = useBatch1;
-
-    testLayer1->forward(PASS_GC);
-
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->forward(PASS_GC);
-
-    testLayer1->getOutputGrad()->randomizeUniform();
-    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch1;
-    testLayer1->backward(nullptr);
-
-    wgt_grad1.copyFrom(*weightGrad);
-    input_grad1.copyFrom(*inputGrad);
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->backward(nullptr);
-
-    wgt_grad2.copyFrom(*weightGrad);
-    input_grad2.copyFrom(*inputGrad);
-
-    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
-    checkError(wgt_grad1, wgt_grad2);
-    checkError(input_grad1, input_grad2);
-  }
-}
-
-TEST(MKLPackedLayer, RecurrentLayer) {
-  LayerConfig layerConfig1;
-  LayerConfig layerConfig2;
-
-  layerConfig1.set_name("paddle-rnn");
-  layerConfig1.set_type("recurrent");
-  layerConfig1.set_active_type("relu");
-
-  layerConfig2.set_name("mkl-packed-rnn");
-  layerConfig2.set_type("mkl_packed_recurrent");
-  layerConfig2.set_active_type("relu");
-
-  FLAGS_use_gpu = false;
-
-  for (auto layerSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {true, false}) {
-        for (auto paddle_use_batch : {true, false}) {
-          for (auto MKLPacked_use_batch : {true, false}) {
-            LOG(INFO) << " layerSize=" << layerSize
-                      << " batchSize=" << batchSize << " reversed=" << reversed
-                      << " paddle_use_batch=" << paddle_use_batch
-                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
-
-            checkMKLPackedLayer(layerConfig1,
-                                layerConfig2,
-                                reversed,
-                                layerSize,
-                                batchSize,
-                                paddle_use_batch,
-                                MKLPacked_use_batch);
-          }
-        }
-      }
-    }
-  }
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  if (!version::isWithGpu()) {
-    testing::GTEST_FLAG(filter) = "-Layer.*";
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
deleted file mode 100644
index 1975d9196d6..00000000000
--- a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
+++ /dev/null
@@ -1,471 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <math.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-#include <ctime>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/FullyConnectedLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(num_passes);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(config_args);
-
-size_t fcLayerWidth = 1024;
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-int randint(int* data, size_t int_max, size_t size) {
-  srand((size_t)(time(NULL)));
-  if (int_max < size) {
-    return -1;
-  }
-  size_t count = 0;
-  std::map<int, int> tmp;
-  int this_int = 0;
-
-  while (count < size) {
-    this_int = std::rand() % int_max;  // NOLINT
-    if (tmp.find(this_int) == tmp.end()) {
-      tmp[this_int] = 0;
-      count += 1;
-    }
-  }
-
-  if (tmp.size() != size) {
-    return -1;
-  }
-  count = 0;
-  for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) {
-    data[count] = itr->first;
-    count += 1;
-  }
-  return 0;
-}
-
-void calcOutput(ComData& comData,
-                const string configFile,
-                const string configArgs,
-                bool useGpu) {
-  FLAGS_config = configFile;
-  FLAGS_config_args = configArgs;
-  FLAGS_use_gpu = useGpu;
-  FLAGS_init_model_path = "legacy/gserver/tests/SelectiveFcTest/model";
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlags(), false);
-
-  comData.parameters = trainer.getGradientMachine()->getParameters();
-
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  DataBatch dataBatch;
-  dataProvider->setSkipShuffle();
-  dataProvider->reset();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &comData.outArgs, PASS_TRAIN);
-  trainer.getGradientMachine()->finish();
-}
-
-void checkMatrix(real* A, real* B, size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  int diffNum = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) ||
-        std::isnan(B[i])) {
-    } else if (fabs(A[i] - B[i]) > err) {
-      diffNum++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void checkTranspose(real* matrix,
-                    real* transpose,
-                    size_t width,
-                    size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t height = matSize / width;
-  int diffNum = 0;
-  size_t rowId = 0;
-  size_t colId = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (i % width == 0 && i) {
-      rowId++;
-    }
-    colId = i % width;
-    if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) {
-      diffNum++;
-      LOG(INFO) << i << " diff : " << matrix[i] << "\t"
-                << transpose[colId * height + rowId];
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void compareOutput(ComData& fcData, ComData& selFcData) {
-  vector<Argument> outArgsFc = fcData.outArgs;
-  vector<Argument> outArgsSelfc = selFcData.outArgs;
-
-  // check cost
-  LOG(INFO) << "Check cost";
-  CpuMatrix fcCost(outArgsFc[0].value->getHeight(),
-                   outArgsFc[0].value->getWidth());
-  CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(),
-                      outArgsSelfc[0].value->getWidth());
-  fcCost.copyFrom(*outArgsFc[0].value);
-  selfcCost.copyFrom(*outArgsSelfc[0].value);
-  checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt());
-
-  // check selective fc output and fc output
-  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer "
-            << "with FullyConectedLayer";
-  CpuMatrix fcOut(outArgsFc[1].value->getHeight(),
-                  outArgsFc[1].value->getWidth());
-  CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(),
-                     outArgsSelfc[1].value->getWidth());
-
-  fcOut.copyFrom(*outArgsFc[1].value);
-  selfcOut.copyFrom(*outArgsSelfc[1].value);
-  checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt());
-
-  // check gradient math
-  vector<ParameterPtr>& fcParam = fcData.parameters;
-  vector<ParameterPtr>& selfcParam = selFcData.parameters;
-  for (size_t i = 0; i < fcParam.size(); ++i) {
-    ParameterPtr p1, p2;
-    p1 = fcParam[i];
-    p2 = selfcParam[i];
-
-    string paramName = p1->getName();
-    LOG(INFO) << "check parameter : " << paramName;
-
-    // check parameter value
-    CpuVector paraValue1(p1->getSize());
-    CpuVector paraValue2(p2->getSize());
-    paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE));
-    paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE));
-
-    // check gradient
-    CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT));
-    CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT));
-    if (paramName == "rand_fc_param.bias") {
-      checkMatrix(
-          paraValue1.getData(), paraValue2.getData(), paraValue1.getSize());
-      checkMatrix(
-          paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize());
-    } else {
-      checkTranspose(paraValue1.getData(),
-                     paraValue2.getData(),
-                     fcLayerWidth,
-                     paraValue1.getSize());
-      checkTranspose(paraGrad1.getData(),
-                     paraGrad2.getData(),
-                     fcLayerWidth,
-                     paraGrad1.getSize());
-    }
-  }
-}
-
-void compareSparseMulOutput(
-    real* fcOutput,
-    real* selOutput,
-    size_t nnz,
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& selCols) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t nnzCount =
-      std::accumulate(selCols->begin(),
-                      selCols->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-  EXPECT_EQ(nnz, nnzCount);
-
-  size_t sampleNum = selCols->size();
-  int diffNum = 0;
-  size_t count = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    for (size_t j = 0; j < (*selCols)[i].second; ++j) {
-      size_t selIdx = (*selCols)[i].first[j];
-      if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) {
-        diffNum++;
-        LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx]
-                  << "\t" << selOutput[count];
-      }
-      count++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        size_t layerSize,
-                        std::vector<real>& values,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->copyFrom(values.data(), batchSize * layerSize);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_TEST);
-  return layer;
-}
-
-ParameterPtr creatParameter(
-    string name, int pid, size_t paraSize, string paramFile, bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->randomize();
-  parameter->setID(pid);
-  parameter->load(paramFile);
-  return parameter;
-}
-
-LayerPtr initFcLayer(LayerPtr dataLayer,
-                     LayerConfig layerConfig,
-                     int dataLayerSize,
-                     int fcLayerSize,
-                     string paraName,
-                     string paraFile,
-                     bool useGpu) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-
-  layerMap[dataLayer->getName()] = dataLayer;
-  ParameterPtr para = creatParameter(
-      paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name(dataLayer->getName());
-  input.set_input_parameter_name(paraName);
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->setNeedGradient(false);
-  testLayer->init(layerMap, parameterMap);
-  return testLayer;
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in fc.conf and selective_fc.conf is float
-TEST(Layer, SelectiveFcLayer_train_dense_mul) {
-  const string& fcConfig = "legacy/gserver/tests/SelectiveFcTest/conf/fc.conf";
-  const string& fcConfigArgs =
-      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
-  const string& selFcConfig =
-      "legacy/gserver/tests/SelectiveFcTest/conf/selective_fc.conf";
-  const string& selConfigArgs =
-      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
-
-  for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-    if (useGpu) {
-      break;
-    }
-#endif
-    LOG(INFO) << "FullyConnectedLayer forwardBackward()";
-    ComData fcData;
-    calcOutput(fcData, fcConfig, fcConfigArgs, useGpu);
-
-    LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()";
-    ComData selFcData;
-    calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu);
-    compareOutput(fcData, selFcData);
-  }
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
-                                        bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  size_t batchSize = 100;
-  size_t dataLayerSize = 512;
-  std::vector<real> values(batchSize * dataLayerSize);
-  for (size_t j = 0; j < batchSize * dataLayerSize; ++j) {
-    values[j] = std::rand() / real(RAND_MAX);
-  }
-  LayerPtr dataLayer =
-      creatDataLayer("data", batchSize, dataLayerSize, values, useGpu);
-
-  const string& selfcParaFile =
-      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
-  const string& selfcParaName = "rand_fc_param.w.transpose";
-
-  std::shared_ptr<SelectiveFullyConnectedLayer> selfcLayer =
-      std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>(
-          initFcLayer(dataLayer,
-                      config,
-                      dataLayerSize,
-                      fcLayerWidth,
-                      selfcParaName,
-                      selfcParaFile,
-                      useGpu));
-
-  // create selected columns
-  std::shared_ptr<std::vector<std::pair<int*, size_t>>> selCols(
-      new std::vector<std::pair<int*, size_t>>(batchSize));
-  size_t maxNNZ = 30;
-  srand((size_t)(time(NULL)));
-  int total = 0;
-  while (total == 0) {
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t num = std::rand() % maxNNZ;
-      int* data = new int[num];
-      randint(data, fcLayerWidth, num);
-      (*selCols)[i] = std::make_pair(data, num);
-      total += num;
-    }
-  }
-  selfcLayer->fillSelectiveData(selCols);
-  selfcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatSelfc = selfcLayer->getOutputValue();
-  CpuSparseMatrixPtr cpuOutMatSelfc(
-      new CpuSparseMatrix(outMatSelfc->getHeight(),
-                          outMatSelfc->getWidth(),
-                          outMatSelfc->getElementCnt()));
-  cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueSelfc = cpuOutMatSelfc->getValue();
-  size_t nnz = cpuOutMatSelfc->getElementCnt();
-
-  const string& fcParaFile =
-      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
-  const string& fcParaName = "rand_fc_param.w";
-  LayerConfig fcLayerConfig;
-  fcLayerConfig.set_name("fc_layer");
-  fcLayerConfig.set_type("fc");
-  fcLayerConfig.set_active_type("linear");
-  fcLayerConfig.set_size(fcLayerWidth);
-
-  LayerPtr fcLayer = initFcLayer(dataLayer,
-                                 fcLayerConfig,
-                                 dataLayerSize,
-                                 fcLayerWidth,
-                                 fcParaName,
-                                 fcParaFile,
-                                 useGpu);
-  fcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatFc = fcLayer->getOutputValue();
-  MatrixPtr cpuOutMatFc(
-      new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
-  cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueFc = cpuOutMatFc->getData();
-
-  compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols);
-  for (size_t i = 0; i < batchSize; ++i) {
-    delete[](*selCols)[i].first;
-  }
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in testSelectiveFcLayerTrainSparseMul is float
-TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
-  LayerConfig selLayerConfig;
-  selLayerConfig.set_name("sel_fc");
-  selLayerConfig.set_type("selective_fc");
-  selLayerConfig.set_active_type("linear");
-  selLayerConfig.set_has_selected_colums(false);
-  selLayerConfig.set_selective_fc_pass_generation(true);
-  selLayerConfig.set_size(fcLayerWidth);
-
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifdef PADDLE_WITH_CUDA
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
-#endif
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-// TODO(dangqingqing) test multi threads after support in matrix
-// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) {
-//   LayerConfig selLayerConfig;
-//   selLayerConfig.set_name("sel_fc");
-//   selLayerConfig.set_type("selective_fc");
-//   selLayerConfig.set_active_type("linear");
-//   selLayerConfig.set_has_selected_colums(false);
-//   selLayerConfig.set_selective_fc_pass_generation(true);
-//   selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10);
-//   selLayerConfig.set_selective_fc_full_mul_ratio(1000);
-//   selLayerConfig.set_size(fcLayerWidth);
-//   SelectiveFcLayer_test(selLayerConfig, false);
-// }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
deleted file mode 100644
index 05acd714219..00000000000
--- a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const int MAX_SEQ_NUM = 17;
-const int MAX_SEQ_LEN = 23;
-const int MAX_BEAM_SIZE = 13;
-
-const size_t SEED = (size_t)(time(NULL));
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
-  seqStartPos.resize(1, 0);
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int j = 0; j < subSeqNum; ++j)
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % MAX_SEQ_LEN)));
-    seqStartPos.push_back(subSeqStartPos.back());
-  }
-}
-
-/*
-  generate start indices according to sequence start positions.
- */
-void genStarts(vector<int>& seqStartPos,
-               vector<vector<real>>& starts,
-               size_t beamSize) {
-  starts.clear();
-  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    vector<real> randStarts =
-        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
-    copy(begin(randStarts), end(randStarts), begin(starts[i]));
-  }
-}
-
-/*
-  generate end indices according to sequence start positions and start indices.
- */
-void genEnds(vector<int>& seqStartPos,
-             vector<vector<real>>& starts,
-             vector<vector<real>>& ends,
-             size_t beamSize) {
-  CHECK_EQ(seqStartPos.size() - 1, starts.size());
-  ends.clear();
-  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < starts.size(); ++i) {
-    for (size_t j = 0; j < starts[i].size(); ++j) {
-      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-      CHECK_GE(seqLen - 1, starts[i][j]);
-      if (starts[i][j] == -1.) break;
-      if (starts[i][j] == (seqLen - 1)) {
-        ends[i][j] = starts[i][j];
-      } else {
-        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
-      }
-    }
-  }
-}
-
-void genTestData(vector<int>& seqStartPos,
-                 vector<int>& subSeqStartPos,
-                 vector<vector<real>>& starts,
-                 vector<vector<real>>& ends,
-                 bool hasSubseq) {
-  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
-  genSeqInfo(seqStartPos, subSeqStartPos);
-
-  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
-  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
-}
-
-template <typename T>
-void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
-  size_t totalSize{0};
-  for (auto const& items : inVec) totalSize += items.size();
-  outVec.reserve(totalSize);
-
-  for (auto& items : inVec)
-    move(items.begin(), items.end(), back_inserter(outVec));
-}
-
-void testSeqSliceLayer(bool hasSubseq,
-                       bool useGpu,
-                       vector<int>& seqStartPos,
-                       vector<int>& subSeqStartPos,
-                       vector<vector<real>>& starts,
-                       vector<vector<real>>& ends) {
-  // layer size is not crutial for this layer,
-  // so here use a small layer size in the unittest.
-  const size_t layerSize{4};
-  TestConfig config;
-  config.layerConfig.set_type("seq_slice");
-  config.layerConfig.set_size(layerSize);
-
-  // add the first input
-  MatrixPtr seqInputPtr =
-      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
-                     layerSize,
-                     false,
-                     false);
-  seqInputPtr->randomizeUniform();
-
-  if (hasSubseq) {
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                "seq_input",
-                                seqInputPtr,
-                                seqStartPos,
-                                subSeqStartPos});
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
-  }
-  config.layerConfig.add_inputs();
-
-  // add start indices
-  if (starts.size()) {
-    vector<real> startsToVec;
-    flatten2dVector(starts, startsToVec);
-
-    MatrixPtr startMatrixPtr =
-        Matrix::create(starts.size(), starts[0].size(), false, false);
-    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
-
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(true);
-  }
-
-  // add end indices
-  if (ends.size()) {
-    vector<real> endsToVec;
-    flatten2dVector(ends, endsToVec);
-
-    MatrixPtr endMatrixPtr =
-        Matrix::create(ends.size(), ends[0].size(), false, false);
-    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
-
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(false);
-  }
-
-  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
-}
-
-TEST(Layer, SeqSliceLayer) {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<vector<real>> starts;
-  vector<vector<real>> ends;
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-  genSeqInfo(seqStartPos, subSeqStartPos);
-  for (bool hasSubseq : {true, false}) {
-    LOG(INFO) << "hasSubSeq : " << hasSubseq;
-    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
-    for (bool useGpu : mode) {
-      vector<vector<real>> tmp;
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Upsample.cpp b/paddle/legacy/gserver/tests/test_Upsample.cpp
deleted file mode 100644
index 940d46baf73..00000000000
--- a/paddle/legacy/gserver/tests/test_Upsample.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/testing/TestUtil.h"
-
-void setPoolConfig(paddle::TestConfig* config,
-                   paddle::PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(1);
-
-  int kw = 2, kh = 2;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(2);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow =
-      paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh =
-      paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
-                                   const string& poolType,
-                                   bool use_gpu,
-                                   real* tempGradData) {
-  /* prepare maxPoolWithMaskLayer */
-  paddle::TestConfig config;
-  config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
-  paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
-  paddle::PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(8);
-  pool->set_img_size_y(8);
-  setPoolConfig(&config, pool, "max-pool-with-mask");
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  config.layerConfig.set_name("MaxPoolWithMask");
-
-  std::vector<paddle::DataLayerPtr> dataLayers;
-  paddle::LayerMap layerMap;
-  vector<paddle::Argument> datas;
-
-  initDataLayer(config,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "MaxPoolWithMask",
-                1,
-                false,
-                use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
-
-  FLAGS_use_gpu = use_gpu;
-  std::vector<paddle::ParameterPtr> parameters;
-  paddle::LayerPtr maxPoolingWithMaskOutputLayer;
-  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
-  maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
-
-  /* prepare the upsample layer */
-  paddle::LayerConfig upsampleLayerConfig;
-  upsampleLayerConfig.set_type("upsample");
-  paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
-  upsampleLayerConfig.add_inputs();
-
-  paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
-  upsampleConfig->set_scale(2);
-  paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
-  imageConfig->set_channels(2);
-  imageConfig->set_img_size(4);
-  imageConfig->set_img_size_y(4);
-  upsampleLayerConfig.set_size(2 * 8 * 8);
-  upsampleLayerConfig.set_name("upsample");
-
-  for (size_t i = 0; i < 2; i++) {
-    paddle::LayerInputConfig& inputTemp =
-        *(upsampleLayerConfig.mutable_inputs(i));
-    inputTemp.set_input_layer_name("MaxPoolWithMask");
-  }
-
-  paddle::LayerPtr upsampleLayer;
-  paddle::ParameterMap parameterMap;
-  upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
-  layerMap[upsampleLayerConfig.name()] = upsampleLayer;
-  upsampleLayer->init(layerMap, parameterMap);
-  upsampleLayer->setNeedGradient(true);
-  upsampleLayer->forward(paddle::PASS_GC);
-  upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
-  upsampleLayer->backward();
-
-  return upsampleLayer;
-}
-
-TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
-  bool useGpu = false;
-  paddle::MatrixPtr inputMat;
-  paddle::MatrixPtr inputGPUMat;
-  paddle::MatrixPtr tempGradMat;
-
-  inputMat = paddle::Matrix::create(1, 128, false, useGpu);
-  inputMat->randomizeUniform();
-
-  tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
-  tempGradMat->randomizeUniform();
-  real* tempGradData = tempGradMat->getData();
-
-  paddle::LayerPtr upsampleLayerCPU =
-      doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
-
-#ifdef PADDLE_WITH_CUDA
-  useGpu = true;
-  real* data = inputMat->getData();
-  inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
-  inputGPUMat->copyFrom(data, 128);
-  paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
-      inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
-  paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
-                           upsampleLayerGPU->getOutput("").value);
-
-  paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
-                           upsampleLayerGPU->getPrev(0)->getOutputGrad());
-#endif
-}
diff --git a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
deleted file mode 100644
index b1697e16164..00000000000
--- a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Version.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/CTCLayer.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/gserver/layers/WarpCTCLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-
-const real* getData(const Matrix& matrix) {
-  if (matrix.useGpu()) {
-    MatrixPtr cpuMatrix = Matrix::create(
-        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
-    cpuMatrix->copyFrom(matrix);
-    return cpuMatrix->getData();
-  } else {
-    return matrix.getData();
-  }
-}
-
-int checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
-  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
-  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-
-  const real* data1 = getData(matrix1);
-  const real* data2 = getData(matrix2);
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-  return count;
-}
-
-void initArgument(size_t batchSize,
-                  int layerSize,
-                  bool useGpu,
-                  Argument& data) {
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-}
-
-LayerPtr createDataLayer(
-    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createLabelLayer(string name,
-                          size_t batchSize,
-                          size_t numClasses,
-                          bool useGpu) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(1);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  Argument data;
-  data.ids = IVector::create(batchSize, useGpu);
-  data.ids->rand(numClasses - 1);
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  labelLayer->setData(data);
-  labelLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createCTCLayer(string name,
-                        size_t numClasses,
-                        bool useGpu,
-                        bool normByTimes,
-                        LayerPtr dataLayer,
-                        LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
-
-  softmaxActivation->forward(dataLayer->getOutput()).check();
-  layer->forward(PASS_GC);
-
-  layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput()).check();
-
-  return layer;
-}
-
-LayerPtr createWarpCTCLayer(string name,
-                            size_t numClasses,
-                            bool useGpu,
-                            bool normByTimes,
-                            LayerPtr dataLayer,
-                            LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("warp_ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_blank(numClasses - 1);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  layer->forward(PASS_GC);
-  layer->backward();
-
-  return layer;
-}
-
-TEST(Layer, WarpCTCLayer) {
-  for (auto layerSize : {10, 64}) {
-    for (auto batchSize : {1, 10, 32}) {
-      for (auto normByTimes : {false, true}) {
-        for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-          if (useGpu) continue;
-#endif
-          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
-
-          FLAGS_use_gpu = useGpu;
-
-          Argument data0;
-          initArgument(batchSize, layerSize, useGpu, data0);
-
-          Argument data1;
-          data1.resizeAndCopyFrom(data0);
-
-          LayerPtr dataLayer0 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data0);
-          LayerPtr dataLayer1 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data1);
-
-          LayerPtr labelLayer =
-              createLabelLayer("label", batchSize, layerSize, useGpu);
-
-          LayerPtr warpctcLayer = createWarpCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
-          LayerPtr ctcLayer = createCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
-
-          /// Check cost
-          LOG(INFO) << "Check cost: "
-                    << checkError(*(warpctcLayer->getOutput().value),
-                                  *(ctcLayer->getOutput().value))
-                    << " different elements.";
-
-          /// Check gradients
-          LOG(INFO) << "Check gradients: "
-                    << checkError(*(dataLayer0->getOutput().grad),
-                                  *(dataLayer1->getOutput().grad))
-                    << " different elements";
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/math/Allocator.h b/paddle/legacy/math/Allocator.h
deleted file mode 100644
index ffb5ec1cad4..00000000000
--- a/paddle/legacy/math/Allocator.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdlib.h>
-#include <mutex>
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * @brief Allocator base class.
- *
- * This is the base class of all Allocator class.
- */
-class Allocator {
- public:
-  virtual ~Allocator() {}
-  virtual void* alloc(size_t size) = 0;
-  virtual void free(void* ptr) = 0;
-  virtual std::string getName() = 0;
-};
-
-/**
- * @brief CPU allocator implementation.
- */
-class CpuAllocator : public Allocator {
- public:
-  ~CpuAllocator() {}
-
-  /**
-   * @brief Aligned allocation on CPU.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr;
-#ifdef PADDLE_WITH_MKLDNN
-    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
-    // memory alignment
-    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
-#else
-    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
-#endif
-    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-    return ptr;
-  }
-
-  /**
-   * @brief Free the memory space.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      ::free(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cpu_alloc"; }
-};
-
-/**
- * @brief GPU allocator implementation.
- */
-class GpuAllocator : public Allocator {
- public:
-  ~GpuAllocator() {}
-
-  /**
-   * @brief Allocate GPU memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_device(size);
-    CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the GPU memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_device(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "gpu_alloc"; }
-};
-
-/**
- * @brief CPU pinned memory allocator implementation.
- */
-class CudaHostAllocator : public Allocator {
- public:
-  ~CudaHostAllocator() {}
-
-  /**
-   * @brief Allocate pinned memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_host(size);
-    CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the pinned memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_host(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cuda_host_alloc"; }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.cu b/paddle/legacy/math/BaseMatrix.cu
deleted file mode 100644
index 7e7cdc57a98..00000000000
--- a/paddle/legacy/math/BaseMatrix.cu
+++ /dev/null
@@ -1,1953 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/Logging.h>
-#include <string.h>
-#include <cmath>
-#include "BaseMatrix.h"
-#include "MathFunctions.h"
-#include "NEONFunctions.h"
-#include "SIMDFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_base.cuh"
-#include "hl_matrix_ops.cuh"
-
-namespace paddle {
-
-const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op) {
-  MatrixOffset offset(0, 0);
-  applyUnary(op, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op,
-                               int numRows,
-                               int numCols,
-                               MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-
-  T* A = data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_unary_op(op, A, dimM, dimN, lda);
-  } else {
-    hl_cpu_apply_unary_op(op, A, dimM, dimN, lda);
-  }
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
-  CHECK(height_ == b.height_ && width_ == b.width_)
-      << "Matrix dimensions are not equal";
-
-  MatrixOffset offset(0, 0, 0, 0);
-  applyBinary(op, b, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(
-    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
-  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
-  return 0;
-}
-
-template <class T>
-template <class Op, class bAsRowVector, class bAsColVector>
-int BaseMatrixT<T>::applyBinary(Op op,
-                                BaseMatrixT& b,
-                                int numRows,
-                                int numCols,
-                                MatrixOffset& offset,
-                                bAsRowVector,
-                                bAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (!bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (!bAsRowVector::value && bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-  } else {
-  }
-  if (true == useGpu_) {
-    hl_gpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  } else {
-    hl_cpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  applyTernary(op, b, c, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset) {
-  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
-
-  return 0;
-}
-
-template <class T>
-template <class Op, class cAsRowVector, class cAsColVector>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset,
-                                 cAsRowVector,
-                                 cAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  if (!cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (!cAsRowVector::value && cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-  } else {
-  }
-
-  if (true == useGpu_) {
-    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  } else {
-    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-  CHECK_EQ(height_, d.height_);
-  CHECK_EQ(width_, d.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0);
-  applyQuaternary(op, b, c, d, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d,
-                                    int numRows,
-                                    int numCols,
-                                    MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-  CHECK_EQ(useGpu_, d.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-  int ldd = d.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  T* D = d.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-  CAL_MATRIX_START_ADDRESS(
-      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  CHECK_LE(dimM + offset.cRow_, c.height_);
-  CHECK_LE(dimN + offset.cCol_, c.width_);
-  CHECK_LE(dimM + offset.dRow_, d.height_);
-  CHECK_LE(dimN + offset.dCol_, d.width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  } else {
-    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    } else {
-      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    } else {
-      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              BaseMatrixT& c,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-/**
- * @brief   unary operator.
- *
- */
-
-DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
-template <class T>
-void BaseMatrixT<T>::neg() {
-  applyUnary(unary::Neg<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
-template <>
-void BaseMatrixT<real>::exp2() {
-  applyUnary(unary::Exp<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
-template <>
-void BaseMatrixT<real>::log2() {
-  if (useGpu_) {
-    applyUnary(unary::Log<real>());
-  } else {
-    vLog(height_ * width_, data_, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
-template <>
-void BaseMatrixT<real>::sqrt2() {
-  applyUnary(unary::Sqrt<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
-template <class T>
-void BaseMatrixT<T>::square2() {
-  applyUnary(unary::Square<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2() {
-  applyUnary(unary::Reciprocal<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2() {
-  applyUnary(unary::Abs<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
-template <class T>
-void BaseMatrixT<T>::sign2() {
-  applyUnary(unary::Sign<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-template <class T>
-void BaseMatrixT<T>::zero() {
-  applyUnary(unary::Zero<T>());
-}
-
-template <class T>
-void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
-  int numRows = height_;
-  int numCols = numColumns;
-  MatrixOffset offset(columnOffset, 0);
-  applyUnary(unary::Zero<T>(), numRows, numCols, offset);
-}
-
-DEFINE_MATRIX_UNARY_OP(One, a = 1);
-template <class T>
-void BaseMatrixT<T>::one() {
-  applyUnary(unary::One<T>());
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
-template <>
-void BaseMatrixT<real>::pow2(real p) {
-  if (useGpu_) {
-    applyUnary(unary::Pow<real>(p));
-  } else {
-    vPow(height_ * width_, data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
-template <class T>
-void BaseMatrixT<T>::subScalar(T p) {
-  applyUnary(unary::SubScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(T p) {
-  applyUnary(unary::MulScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
-template <class T>
-void BaseMatrixT<T>::divScalar(T p) {
-  applyUnary(unary::DivScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
-template <class T>
-void BaseMatrixT<T>::assign(T p) {
-  applyUnary(unary::Assign<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
-template <class T>
-void BaseMatrixT<T>::add(T p) {
-  applyUnary(unary::Add<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
-template <class T>
-void BaseMatrixT<T>::add(T p1, T p2) {
-  applyUnary(unary::Add2<T>(p1, p2));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
-                                 TWO_PARAMETER,
-                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
-template <class T>
-void BaseMatrixT<T>::clip(T p1, T p2) {
-  applyUnary(unary::Clip<T>(p1, p2));
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
-                                  TWO_PARAMETER,
-                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
-template <class T>
-void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
-                                 ONE_PARAMETER,
-                                 a = a > p ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThanScalar(T p) {
-  applyUnary(unary::BiggerThanScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
-template <class T>
-void BaseMatrixT<T>::downClip(T p) {
-  applyUnary(unary::DownClip<T>(p));
-}
-
-/**
- * @brief   binary operator.
- *
- */
-
-DEFINE_MATRIX_BINARY_OP(Add, a += b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b) {
-  applyBinary(binary::Add<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::add(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Add<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vAdd(height_ * width_, data_, b.data_, data_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
-  T* A = data_;
-  T* B = b.data_;
-  int dimM = height_;
-  int dimN = width_;
-
-  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
-      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
-  applyBinary(binary::Add1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
-template <>
-void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
-  if (useGpu_) {
-    applyBinary(binary::Pow<real>(p), b);
-  } else {
-    vPow(height_ * width_, b.data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Add2<T>(p1, p2), b);
-}
-
-template <class T>
-void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add1<T>(scale),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b) {
-  applyBinary(binary::Sub<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
-  applyBinary(binary::Sub1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
-template <class T>
-void BaseMatrixT<T>::relu(BaseMatrixT& b) {
-  applyBinary(binary::Relu<T>(), b);
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-void BaseMatrixT<float>::relu(BaseMatrixT& b) {
-  neon::relu(data_, b.data_, height_ * width_);
-}
-#endif
-
-DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
-template <class T>
-void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReluDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
-                        b = log(1.0 + exp((a > THRESHOLD)
-                                              ? THRESHOLD
-                                              : ((a < -THRESHOLD) ? (-THRESHOLD)
-                                                                  : a))));
-template <>
-void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
-  applyBinary(binary::Softrelu<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(
-    SoftreluDerivative, const T THRESHOLD = 40.0;
-    a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-                                ? THRESHOLD
-                                : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-template <>
-void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SoftreluDerivative<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
-                                  b = b < p2 ? b : p2);
-template <class T>
-void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
-  applyBinary(binary::Brelu<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
-                                  TWO_PARAMETER,
-                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
-template <class T>
-void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;
-  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
-template <class T>
-void BaseMatrixT<T>::square2(BaseMatrixT& b) {
-  applyBinary(binary::Square<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
-template <class T>
-void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SquareDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <>
-void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
-  applyBinary(binary::Tanh<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
-template <class T>
-void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
-  applyBinary(binary::TanhDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(
-    ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
-template <>
-void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
-  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
-                                  TWO_PARAMETER,
-                                  a *= p2 * (p1 - b * b));
-template <class T>
-void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
-  applyBinary(binary::Reciprocal<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
-template <class T>
-void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReciprocalDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
-  applyBinary(binary::Abs<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
-template <class T>
-void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
-  applyBinary(binary::AbsDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
-                        const T THRESHOLD_MAX = 13.0;
-                        T tmp = (a < THRESHOLD_MIN)
-                                    ? THRESHOLD_MIN
-                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-                        b = 1.0f / (1.0f + exp(-tmp)));
-template <>
-void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Sigmoid<real>(), b);
-  } else {  // cpu versioni
-    size_t numSamples = this->height_;
-    size_t dim = this->width_;
-    CHECK_EQ(b.height_, numSamples);
-    CHECK_EQ(b.width_, dim);
-    const real* in = this->data_;
-    real* out = b.data_;
-
-    // out = - in
-    const float THRESHOLD_MIN = -40.0;  // make sure sigmoid(x) > 0
-    const float THRESHOLD_MAX = 13.0;   // make sure sigmoid(x) < 1
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      real tmp = in[i];
-      tmp = (tmp < THRESHOLD_MIN)
-                ? THRESHOLD_MIN
-                : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp);
-      out[i] = -tmp;
-    }
-
-    // out = exp(out)
-    vExp(numSamples * dim, out, out);
-
-    // out = 1 / (1 + out)
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      out[i] = 1 / (1 + out[i]);
-    }
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
-template <class T>
-void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SigmoidDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
-template <class T>
-void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ExpDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
-template <class T>
-void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
-  applyBinary(binary::Sign<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
-template <>
-void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
-  applyBinary(binary::Exp<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
-template <>
-void BaseMatrixT<real>::log2(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Log<real>(), b);
-  } else {
-    vLog(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
-template <>
-void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
-  applyBinary(binary::Sqrt<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
-template <>
-void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::InvSqrt<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vInvSqrt(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
-template <class T>
-void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
-  applyBinary(binary::IsEqual<T>(value), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
-template <class T>
-void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
-template <class T>
-void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::SubScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::MulScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
-template <class T>
-void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::DivScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
-template <class T>
-void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
-  applyBinary(binary::ScalarDiv<T>(p), b);
-}
-
-/**
- * @brief   ternary operator.
- *
- */
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
-                         a = -c * log(b) - (1 - c) * log(1 - b));
-template <>
-void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
-template <class T>
-void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
-                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
-template <>
-void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
-                                                BaseMatrixT& c) {
-  if (useGpu_) {
-    applyTernary(ternary::BinaryCrossEntropy<real>(), b, c);
-  } else {
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(height_, c.height_);
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(width_, c.width_);
-
-    size_t size = height_ * width_;
-    real* out = b.data_;
-    real* label = c.data_;
-    real* cost = data_;
-
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i];
-    }
-    vLog(size, cost, cost);
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] *= -1.0;
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
-                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
-template <class T>
-void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Add1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Sub<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add2<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
-                                   THREE_PARAMETER,
-                                   a = p1 * a + p2 * b + p3 * c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
-                                   THREE_PARAMETER,
-                                   c = p2 * c - p1 * (b + p3 * a);
-                                   a = a + c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
-                               BaseMatrixT& c,  // mom
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
-                                      THREE_PARAMETER,
-                                      c = p2 * c - p1 * d * (b + p3 * a);
-                                      a += c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
-                               BaseMatrixT& c,  // mom,
-                               BaseMatrixT& d,  // lr,
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
-                                  a = (a > lambda)
-                                          ? (a - lambda)
-                                          : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
-  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
-                                real learningRate,
-                                real decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
-  } else {
-    simd::decayL1(this->data_,
-                  this->data_,
-                  lr.data_,
-                  learningRate * decayRate,
-                  height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
-                                 a = (a > lambda)
-                                         ? (a - lambda)
-                                         : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
-  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
-  if (useGpu_) {
-    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
-  } else {
-    simd::decayL1(
-        this->data_, this->data_, learningRate * decayRate, height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
-                                  ONE_PARAMETER,
-                                  a *= (1.0f / (1.0f + p * b)));
-template <class T>
-void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
-  } else {
-    size_t size = this->height_ * this->width_;
-    T decay = learningRate * decayRate;
-    for (size_t j = 0; j < size; ++j) {
-      this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
-  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
-  applyBinary(binary::DotMul<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMul<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotDiv<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
-                                   TWO_PARAMETER,
-                                   a = (b + p1) / (c + p2));
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = log(1 + exp(a)) - a * d);
-template <>
-void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLoss<real>(), b, c, d);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = exp(a);
-                            a = (a / (1 + a) - d));
-template <>
-void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
-                                   BaseMatrixT& c,
-                                   BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLossBp<real>(), b, c, d);
-}
-
-/* this = log(1 + exp(b)) - c * b */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         a = log(1 + exp(x)) - c * x);
-template <>
-void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
-}
-
-/* this = exp(b)/(1+exp(b)) - c */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         x = exp(x);
-                         a = x / (1 + x) - c);
-template <>
-void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
-                                                 BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BiggerThan<T>(), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(
-    BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
-                                BaseMatrixT& c,
-                                BaseMatrixT& d) {
-  applyQuaternary(quaternary::BiggerThan<T>(), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
-template <class T>
-void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Max<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
-                                   ONE_PARAMETER,
-                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
-template <class T>
-void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
-                                                BaseMatrixT& b,
-                                                BaseMatrixT& c,
-                                                T p) {
-  CHECK(!useGpu_) << "do not support gpu";
-  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  b.applyTernary(ternary::BinaryClassificationError<T>(p),
-                 c,
-                 *this,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-}
-
-template <>
-void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
-                                                  BaseMatrixT& b,
-                                                  BaseMatrixT& c,
-                                                  real p) {
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::classificationError(p),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
-                                      THREE_PARAMETER,
-                                      a = p1 * b + p2 * c + p3 * d);
-template <class T>
-void BaseMatrixT<T>::add3(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMulSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotSquareSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
-  applyBinary(binary::DotMulSquare<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
-template <class T>
-void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
-  applyBinary(binary::DotSquareMul<T>(), b);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
-                                      THREE_PARAMETER,
-                                      T tmp = p1 * b + p2 * c + p3 * d;
-                                      a += tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::addSquareSum(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
-template <class T>
-void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddSquare<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
-                                  TWO_PARAMETER,
-                                  a = p1 * a + p2 * b * b);
-template <class T>
-void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
-                                       BaseMatrixT& c,
-                                       T p1,
-                                       T p2) {
-  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
-                                   THREE_PARAMETER,
-                                   a = 1 / (p1 * b + p2 * c + p3));
-template <class T>
-void BaseMatrixT<T>::reciprocalSum(
-    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
-                                  TWO_PARAMETER,
-                                  a = 1 / (p1 * b + p2));
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a *= tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
-                                     BaseMatrixT& c,
-                                     T p1,
-                                     T p2) {
-  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a = tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
-                                   TWO_PARAMETER,
-                                   a *= p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
-template <class T>
-void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
-  applyBinary(binary::CopyAndClear<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
-template <class T>
-void BaseMatrixT<T>::assign(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Assign<T>(), b);
-  } else {  // cpu version
-    CHECK_EQ(this->height_, b.height_);
-    CHECK_EQ(this->width_, b.width_);
-    memcpy(data_, b.data_, sizeof(T) * height_ * width_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
-template <class T>
-void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
-  applyBinary(binary::DeepSwap<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::rowDotMul(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c) {
-  int numRows = b.height_;
-  int numCols = b.width_;
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowDotMul2(size_t destCol,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  CHECK_LT(destCol, this->width_);
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(height, c.height_);
-  CHECK_EQ(b.width_, c.width_);
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height;
-       ++i, A += this->width_, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[destCol] += B[j] * C[j];
-    }
-  }
-}
-
-template <>
-void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(height_, 1LU);
-  CHECK_EQ(b.height_, c.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = b.height_;
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /*cAsRowVector*/,
-               false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(c.height_, 1LU);
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = height_;
-  size_t width = width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  size_t width = this->width_;
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(width, b.width_);
-  CHECK_LT(cCol, c.width_);
-  CHECK_EQ(height, c.height_);
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] = B[j] * C[cCol];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
-template <class T>
-void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::RowAdd<T>(p),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
-template <>
-void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  if (useGpu_) {
-    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-    int numRows = height_;
-    int numCols = width_;
-    applyTernary(ternary::RowPow<real>(),
-                 b,
-                 c,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-  } else {
-    size_t height = this->height_;
-    size_t width = this->width_;
-    CHECK_EQ(height, b.height_);
-    CHECK_EQ(width, b.width_);
-    CHECK_LT(cCol, c.width_);
-    CHECK_EQ(height, c.height_);
-    real* A = this->data_;
-    const real* B = b.data_;
-    const real* C = c.data_;
-    for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-      vPow(width, B, C[cCol], A);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
-template <class T>
-void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyRow(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg, class Op, class Saver>
-int BaseMatrixT<real>::applyRow(
-    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  CHECK_EQ(c.height_, numRows);
-  CHECK_EQ(c.width_, numCols);
-  aggregate(agg,
-            op,
-            sv,
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-  return 0;
-}
-
-template <>
-template <class Agg, class Op>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                Op op,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  if (scaleDest != 0) {
-    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
-  } else {
-    applyRow(agg, op, base::binary::second(), b, c);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyCol(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
-  applyRow(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
-  applyRow(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
-  applyCol(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
-  applyCol(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
-                                          BaseMatrixT& c,
-                                          real scaleSum,
-                                          real scaleDest) {
-  applyRow(
-      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
-                                      BaseMatrixT& c,
-                                      real scaleSum,
-                                      real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
-}
-
-template class BaseMatrixT<real>;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-template class BaseMatrixT<int>;
-
-#else
-
-template <>
-void BaseMatrixT<int>::zero() {
-  applyUnary(unary::Zero<int>());
-}
-
-template <>
-void BaseMatrixT<int>::assign(int p) {
-  applyUnary(unary::Assign<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
-  applyBinary(binary::IsEqual<int>(value), b);
-}
-
-template <>
-void BaseMatrixT<int>::neg() {
-  applyUnary(unary::Neg<int>());
-}
-
-template <>
-void BaseMatrixT<int>::abs2() {
-  applyUnary(unary::Abs<int>());
-}
-
-template <>
-void BaseMatrixT<int>::add(int p) {
-  applyUnary(unary::Add<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::add(int p1, int p2) {
-  applyUnary(unary::Add2<int>(p1, p2));
-}
-
-template <>
-void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
-  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
-}
-
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.h b/paddle/legacy/math/BaseMatrix.h
deleted file mode 100644
index 4627f847d35..00000000000
--- a/paddle/legacy/math/BaseMatrix.h
+++ /dev/null
@@ -1,1095 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "TensorExpression.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/*
- * nvcc currently does not support C++11,
- * so I realized false_type and true_type.
- */
-template <class T, T v>
-struct bool_constant {
-  static const T value = v;
-};
-typedef bool_constant<bool, false> false_type;
-typedef bool_constant<bool, true> true_type;
-
-/**
- * @brief   Calculate matrix element address.
- *
- * For instance, address of A[i][j] = i * ld + j.
- *
- */
-#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \
-  CHECK_LE(col, width);                                                \
-  CHECK_LE(row, height);                                               \
-  address += row * ld + col;
-
-class MatrixOffset {
- public:
-  size_t aCol_;
-  size_t aRow_;
-  size_t bCol_;
-  size_t bRow_;
-  size_t cCol_;
-  size_t cRow_;
-  size_t dCol_;
-  size_t dRow_;
-  MatrixOffset(size_t aCol = 0,
-               size_t aRow = 0,
-               size_t bCol = 0,
-               size_t bRow = 0,
-               size_t cCol = 0,
-               size_t cRow = 0,
-               size_t dCol = 0,
-               size_t dRow = 0)
-      : aCol_(aCol),
-        aRow_(aRow),
-        bCol_(bCol),
-        bRow_(bRow),
-        cCol_(cCol),
-        cRow_(cRow),
-        dCol_(dCol),
-        dRow_(dRow) {}
-};
-
-template <class T>
-class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
- public:
-  size_t height_, width_;
-  size_t stride_;
-  T* data_;
-  bool trans_;
-  bool useGpu_;
-
- public:
-  virtual ~BaseMatrixT() {}
-  BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
-      : height_(height),
-        width_(width),
-        stride_(width),
-        data_(data),
-        trans_(trans),
-        useGpu_(useGpu) {}
-
-  /**
-   * @note This constructor is for temporarily making a matrix with different
-   *       useGpu flag as the original matrix so that mixed gpu/cpu operations
-   *       can be performed successfully.
-   */
-  BaseMatrixT(BaseMatrixT& mat, bool useGpu)
-      : height_(mat.height_),
-        width_(mat.width_),
-        stride_(mat.stride_),
-        data_(mat.data_),
-        trans_(mat.trans_),
-        useGpu_(useGpu) {}
-
-  BaseMatrixT(size_t height,
-              size_t width,
-              size_t stride,
-              T* data,
-              bool trans,
-              bool use_gpu)
-      : height_(height),
-        width_(width),
-        stride_(stride),
-        data_(data),
-        trans_(trans),
-        useGpu_(use_gpu) {
-    /* CHECK_LE(width_, stride_); */
-  }
-
-  /// caller should make sure that the size of data is at least height*width
-  void setData(T* data) { data_ = data; }
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op);
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *  A = this->data_ + offset.aRow_*ld + offset.aCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * binary operator: element wise op(a, b).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * While this->height_ == b.height_ && this->width_ == b.width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyBinary(Op op, BaseMatrixT& b);
-
-  /**
-   * binary operator: element wise op(a, b)
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *   A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *   B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *
-   * if (bAsRowVector == false_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[i * ldb + j])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[j])
-   *
-   * if (bAsRowVector == false_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[i * ldb])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[0])
-   * @endcode
-   */
-  template <class Op, class bAsRowVector, class bAsColVector>
-  int applyBinary(Op op,
-                  BaseMatrixT& b,
-                  int numRows,
-                  int numCols,
-                  MatrixOffset& offset,
-                  bAsRowVector,
-                  bAsColVector);
-
-  template <class Op>
-  int applyBinary(
-      Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   *  for 0 <= i < numRows & for 0 <= j < numCols.
-   *  While matrix start address is:
-   *
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
-   *
-   *    if (cAsRowVector == true_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[j])
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == true_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
-   *
-   *    if (cAsRowVector == 1 && cAsColVector == 1)
-   *      op(A[i*lda + j], B[i*ldb + j], C[0])
-   * @endcode
-   */
-  template <class Op, class cAsRowVector, class cAsColVector>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset,
-                   cAsRowVector,
-                   cAsColVector);
-
-  template <class Op>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   *    && this->height_ == d.height_ && this->width_ == d.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *    D = d->data_ + offset.dRow_*ldd + offset.dCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op,
-                      BaseMatrixT& b,
-                      BaseMatrixT& c,
-                      BaseMatrixT& d,
-                      int numRows,
-                      int numCols,
-                      MatrixOffset& offset);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b.
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *  for each column j & 0 <= i < numRows, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *  for each row i & 0 <= j < numCols, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b and c.
-   *
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *   for each column j & 0 <= i < numRows, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *   for each row i & 0 <= j < numCols, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                BaseMatrixT& c,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   this[i] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyRow(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Op, class Saver>
-  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg, class Op>
-  int applyRow(Agg agg,
-               Op op,
-               real scaleDest,
-               real scaleAgg,
-               BaseMatrixT& b,
-               BaseMatrixT& c);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   this[j] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyCol(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[j] = sv(this[j], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  bool useGpu() const { return useGpu_; }
-
-  const T* rowBuf(size_t row) const { return data_ + width_ * row; }
-
-  T* rowBuf(size_t row) { return data_ + width_ * row; }
-
-  /**
-   * @brief   unary operator.
-   *
-   */
-  void neg();
-  void exp2();
-  void pow2(T p);
-  void log2();
-  void sqrt2();
-  void square2();
-  void reciprocal2();
-  void abs2();
-  void sign2();
-  void zero();
-
-  /**
-   * @code
-   * this(row, col + columnOffset) = 0 for 0 <= col < numColumns
-   * @endcode
-   */
-  void zeroAtOffset(int64_t columnOffset, int64_t numColumns);
-  void one();
-  void subScalar(T p);
-  void mulScalar(T p);
-  void divScalar(T p);
-
-  /**
-   * @code
-   * this = p
-   * @endcode
-   */
-  void assign(T p);
-
-  /**
-   * @code
-   * swap(this, b)
-   * example: swap two Matrices
-   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-   * cpuA->deepSwap(*cpuB);
-   * @endcode
-   */
-  void deepSwap(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + p
-   * @endcode
-   */
-  void add(T p);
-
-  /**
-   * @code
-   * this = this*p1 + p2
-   * @endcode
-   */
-  void add(T p1, T p2);
-
-  /**
-   * this = this < low ? low : this
-   *
-   * this = this > high ? high : this
-   */
-  void clip(T p1, T p2);
-
-  /**
-   * this = b < low ? 0 : 1
-   *
-   * this = b > high ? 0 : 1
-   */
-  void clipDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * a = a > p ? 1.0f : 0.0f
-   * @endcode
-   */
-  void biggerThanScalar(T p);
-
-  /**
-   * @code
-   * a = a > p ? a : p
-   * @endcode
-   */
-  void downClip(T p);
-
-  /**
-   * @code
-   * this = b
-   * @endcode
-   */
-  void assign(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void assignAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  /// this = this + b
-  void add(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void addAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  void addColVector(BaseMatrixT& b);
-  void addRowVector(BaseMatrixT& b);
-  void addBias(BaseMatrixT& b, T scale);
-
-  void mulRowVector(BaseMatrixT& b);
-  void divRowVector(BaseMatrixT& b);
-
-  void mulColVector(BaseMatrixT& b);
-  void divColVector(BaseMatrixT& b);
-
-  void addP2P(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + b*p
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1*this + p2*b
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = this - b
-   * @endcode
-   */
-  void sub(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this - b*p
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * b = max(0, this)
-   * @endcode
-   */
-  void relu(BaseMatrixT& b);
-  void reluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = log(1.0 + exp(this))
-   * @endcode
-   */
-  void softrelu(BaseMatrixT& b);
-  void softreluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = min(max(this, p1), p2)
-   * @endcode
-   */
-  void brelu(BaseMatrixT& b);
-  void breluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this * this
-   * @endcode
-   */
-  void square2(BaseMatrixT& b);
-  void squareDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = tanh(this)
-   * @endcode
-   */
-  void tanh(BaseMatrixT& b);
-  void tanhDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = p1 * tanh(p2 * this)
-   * @endcode
-   */
-  void scaledTanh(BaseMatrixT& b, T p1, T p2);
-  void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * b = 1.0f / this
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b);
-  void reciprocalDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this > 0.0f ? this : -this
-   * @endcode
-   */
-  void abs2(BaseMatrixT& b);
-  void absDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = 1.0f / (1.0f + exp(-this))
-   * @endcode
-   */
-  void sigmoid(BaseMatrixT& b);
-  void sigmoidDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = a
-   * @endcode
-   */
-  void expDerivative(BaseMatrixT& b);
-
-  void sign2(BaseMatrixT& b);
-
-  void exp2(BaseMatrixT& b);
-  void pow2(BaseMatrixT& b, T p);
-  void log2(BaseMatrixT& b);
-  void sqrt2(BaseMatrixT& b);
-  void addScalar(BaseMatrixT& b, T p);
-  void subScalar(BaseMatrixT& b, T p);
-  void mulScalar(BaseMatrixT& b, T p);
-  void divScalar(BaseMatrixT& b, T p);
-  void scalarDiv(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = 1.0f / sqrt(b)
-   * @endcode
-   */
-  void invSqrt(BaseMatrixT& b);
-
-  /// this = (b == value)
-  void isEqualTo(BaseMatrixT& b, T value);
-
-  /**
-   * @brief   ternary operator.
-   */
-  void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b + c
-   * @endcode
-   */
-  void add(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 + c*p2
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-  /**
-   * @code
-   * this = b - c
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 - c*p2
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-
-  /**
-   * @code
-   * this = this + b + c
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = this*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this = a*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   *   c = p2 * c - p1 *  (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  //  grad
-                 BaseMatrixT& c,  //  mom
-                 T p1,            //  learningRate,
-                 T p2,            //  momentum,
-                 T p3);           //  decayRate
-
-  /**
-   * @code
-   *   c = p2 * c - p1 * d * (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  // grad,
-                 BaseMatrixT& c,  // mom,
-                 BaseMatrixT& d,  // lr,
-                 T p1,            // learningRate,
-                 T p2,            // momentum,
-                 T p3);           // decayRate
-
-  /// apply L1/L2 to *this*
-  virtual void applyL1(T learningRate, T decayRate);
-  void applyL1(BaseMatrixT& lr, T learningRate, T decayRate);
-  void applyL2(T learningRate, T decayRate);
-  void applyL2(BaseMatrixT& lr, T learningRate, T decayRate);
-
-  /**
-   * @code
-   * this *= b
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b / c
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = (b + p1) / (c + p2)
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = log(1 + exp(b - c)) - d * (b - c)
-   * @endcode
-   */
-  void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-  void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = log(1 + exp(b)) - c * b
-   * @endcode
-   */
-  void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this += exp(b)/(1+exp(b)) - c
-   * @endcode
-   */
-  void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b > c ? 1.0 : 0.0
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = ((b>c && d>0.5) || (b<c && d<0.5)) ? 1 : 0)
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = b>c ? b : c
-   * @endcode
-   */
-  void max2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this[destCol] += (b>p1 == c>p1) ? 0 : 1)
-   * @endcode
-   */
-  void binaryClassificationError(size_t destCol,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 T p);
-  void binaryClassificationError2(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c,
-                                  T p);
-
-  /**
-   * @code
-   * this = this * b * b
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this * this * b
-   * @endcode
-   */
-  void dotSquareMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c * c
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b * b * c * c
-   * @endcode
-   */
-  void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = this * (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this=  this * (p1*b + p2*c)
-   * @endcode
-   */
-  void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this += sqr(p1*b + p2*c + p3*d)
-   * @endcode
-   */
-  void addSquareSum(
-      BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this += p * sqr(b)
-   * @endcode
-   */
-  void addSquare(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b)
-   * @endcode
-   */
-  void decayAddSquare(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b * c)
-   * @endcode
-   */
-  void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2)
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2 * c + p3)
-   * @endcode
-   */
-  void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * b = this; this = 0
-   * @endcode
-   */
-  void copyAndClear(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this_row[destCol] += dotprod(b_row, c_row)
-   * @endcode
-   */
-  void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * this is vector (one row matrix)
-   *
-   * @code
-   *   for each row i, do:
-   *      this_row += dotmul(b_row_i, c_row_i)
-   * @endcode
-   */
-  void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * c is vector (one row matrix)
-   *
-   * @code
-   * for each row i, do:
-   *    this_row_i += dotmul(b_row_i, c_row)
-   * @endcode
-   */
-  void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * b * c
-   * @endcode
-   */
-  void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this_row = b_row * c_row[cCol]
-   * @endcode
-   */
-  void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col = b_col * c_col[cRow]
-   * @endcode
-   */
-  void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col += b_col * c_col[cRow]
-   * @endcode
-   */
-  void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_row += b_row * c_row[cCol]
-   * @endcode
-   */
-  void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /// calculate the sum of each row of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
-  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// calculate the maximum value of each row of the matrix b.
-  void maxRows(BaseMatrixT& b);
-  /// calculate the minimum value of each row of the matrix b.
-  void minRows(BaseMatrixT& b);
-
-  /// calculate the maximum value of each column of the matrix b.
-  void maxCols(BaseMatrixT& b);
-  /// calculate the minimum value of each column of the matrix b.
-  void minCols(BaseMatrixT& b);
-
-  /// calculate the sum of each column of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
-  void sumOfSquaredDiffs(BaseMatrixT& b,
-                         BaseMatrixT& c,
-                         T scaleSum,
-                         T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest);
-
-  /**
-   * @code
-   * this_row = b_row + p * ones * c_row[cCol]
-   * @endcode
-   */
-  void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p);
-  /**
-   * @code
-   * this_row = pow(b_row, c_row[cCol])
-   * @endcode
-   */
-  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  virtual bool isSparse() const { return false; }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
-  template <typename ExpressionType>
-  void operator+=(const ExpressionType& expr) {
-    (*this) = (*this) + expr;
-  }
-  template <typename ExpressionType>
-  void operator-=(const ExpressionType& expr) {
-    (*this) = (*this) - expr;
-  }
-  template <typename ExpressionType>
-  void operator*=(const ExpressionType& expr) {
-    (*this) = (*this) * expr;
-  }
-  template <typename ExpressionType>
-  void operator/=(const ExpressionType& expr) {
-    (*this) = (*this) / expr;
-  }
-};
-
-typedef BaseMatrixT<real> BaseMatrix;
-typedef BaseMatrixT<int> IBaseMatrix;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/CMakeLists.txt b/paddle/legacy/math/CMakeLists.txt
deleted file mode 100644
index 9992ec71f45..00000000000
--- a/paddle/legacy/math/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-# common package contains:
-#   * the utilities:
-#       * Thread Libs
-#       * Memory Manage libs
-#       * CommandLine Parser
-#       * Logging
-#       * Timer/Stats
-#   * the math libraries:
-#       * Matrix/Vector
-#   * the parameter optimizers.
-#   * the parameter updater functions.
-#
-# TODO(yuyang18): separate libs.
-#
-file(GLOB MATH_HEADERS . *.h)
-file(GLOB MATH_SOURCES . *.cpp)
-
-if(NOT WITH_MKLDNN)
-    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
-    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
-    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
-    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
-    message(STATUS "Skip compiling with MKLDNNMatrix")
-else()
-    message(STATUS "Compile with MKLDNNMatrix")
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove sparse
-    list(REMOVE_ITEM MATH_HEADERS
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
-    list(REMOVE_ITEM MATH_SOURCES
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
-endif()
-set(MATH_SOURCES
-    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu"
-    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu"
-    ${MATH_SOURCES})
-if(NOT WITH_GPU)
-    # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu")
-    add_library(paddle_math STATIC
-        ${MATH_SOURCES})
-else()
-    cuda_add_library(paddle_math ${MATH_SOURCES})
-endif()
-
-
-add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/math/CpuSparseMatrix.cpp b/paddle/legacy/math/CpuSparseMatrix.cpp
deleted file mode 100644
index 20c65a3a1d7..00000000000
--- a/paddle/legacy/math/CpuSparseMatrix.cpp
+++ /dev/null
@@ -1,787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CpuSparseMatrix.h"
-#include "SparseMatrix.h"
-#include "float.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH;
-
-CpuSparseMatrix::CpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(dataHandle, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(real* data,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  cols_ = cols;
-  rows_ = rows;
-  value_ = data;
-  height_ = height;
-  width_ = width;
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-}
-
-void CpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  CHECK_LE(newNnz, newHeight * newWidth);
-  size_t newSize = 0;
-  if (format == SPARSE_CSR) {
-    newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  } else {
-    newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = format;
-  sparseResize();
-}
-void CpuSparseMatrix::sparseResize() {
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  }
-}
-
-void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight,
-         newWidth,
-         newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
-         valueType_,
-         format_);
-}
-
-MatrixPtr CpuSparseMatrix::getTranspose() {
-  if (!memoryHandle_ && !value_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        height_, width_, elementCnt_, valueType_, format_, true));
-    return dest;
-  } else if (memoryHandle_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true));
-    return dest;
-  } else if (value_) {
-    MatrixPtr dest(new CpuSparseMatrix(value_,
-                                       rows_,
-                                       cols_,
-                                       height_,
-                                       width_,
-                                       elementCnt_,
-                                       valueType_,
-                                       format_,
-                                       true));
-    return dest;
-  } else {
-    return NULL;
-  }
-}
-
-SparseValueType CpuSparseMatrix::getValueType() { return valueType_; }
-
-void CpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::add3(CpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* A = getValue();
-  real* B = b->getData();
-  int* cols = getCols();
-  for (size_t i = 0; i < height_; i++) {
-    size_t start = getRowStartIdx(i);
-    size_t end = getRowStartIdx(i + 1);
-    for (size_t j = start; j < end; j++) {
-      A[j] = B[i * width_ + cols[j]];
-    }
-  }
-}
-
-void CpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<CpuMatrix*>(b.get())) {
-    add3(dynamic_cast<CpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getData();
-  int* cols = getCols();
-  size_t nnz = getElementCnt();
-  for (size_t i = 0; i < nnz; i++) {
-    A[i] += scale * B[cols[i]];
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void CpuSparseMatrix::print(std::ostream& os) const {
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-  printBuf(os, rows_, rowSize, "row");
-  printBuf(os, cols_, colSize, "col");
-  if (valueType_ == FLOAT_VALUE) {
-    printBuf(os, value_, elementCnt_, "value");
-  }
-  return;
-}
-
-void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  if (format_ == SPARSE_CSC) {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-    return;
-  }
-
-  const int* col = getRowCols(idx);
-  size_t num = getColNum(idx);
-  if (num > 0) {
-    if (valueType_ == FLOAT_VALUE) {
-      const real* data = getRowValues(idx);
-      os << col[0] << ":" << data[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i] << ":" << data[i];
-      }
-    } else {
-      os << col[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i];
-      }
-    }
-  }
-  os << ";";
-}
-
-void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK_EQ(height_, b.getHeight());
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getValue();
-  if (b.getValueType() == FLOAT_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = B[j] * c.getElement(i, cCol);
-      }
-    }
-  } else if (b.getValueType() == NO_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = c.getElement(i, cCol);
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::randomizeUniform() {
-  CHECK_LE(elementCnt_, height_ * width_);
-  if (valueType_ == FLOAT_VALUE) {
-    real* data = getValue();
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      *data++ = rand() / static_cast<real>(RAND_MAX);  // NOLINT
-    }
-  }
-  if (format_ == SPARSE_CSR) {
-    sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false);
-  } else {
-    sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false);
-  }
-}
-
-void CpuSparseMatrix::copyFrom(std::vector<int>& rows,
-                               std::vector<int>& cols,
-                               std::vector<real>& values) {
-  size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size();
-  resize(height_, width_, size, valueType_, format_);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(&value_[0], &values[0], sizeof(real) * values.size());
-  }
-  memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size());
-  memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size());
-}
-
-// Copy from a CpuMatrix, only supported in sparse_float_value_t
-// SparseMatrix.
-void CpuSparseMatrix::copyFrom(const CpuMatrix& src) {
-  CHECK_EQ(getHeight(), src.getHeight());
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK(!src.trans_ && !trans_);
-  if (format_ == SPARSE_CSR) {
-    std::vector<int> rows(getHeight() + 1);
-    std::vector<int> cols;
-    std::vector<real> values;
-    rows[0] = 0;
-    for (size_t r = 0; r < getHeight(); ++r) {
-      for (size_t c = 0; c < getWidth(); ++c) {
-        real v = src.getElement(r, c);
-        if (fabs(v) > FLT_EPSILON) {
-          cols.push_back(c);
-          values.push_back(v);
-        }
-      }
-      rows[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  } else {
-    std::vector<int> cols(getWidth() + 1);
-    std::vector<int> rows;
-    std::vector<real> values;
-    cols[0] = 0;
-    for (size_t r = 0; r < getWidth(); ++r) {
-      for (size_t c = 0; c < getHeight(); ++c) {
-        real v = src.getElement(c, r);
-        if (fabs(v) > FLT_EPSILON) {
-          rows.push_back(c);
-          values.push_back(v);
-        }
-      }
-      cols[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) {
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-  CHECK(width && height);
-  if (!useGpu) {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, 0, valueType_, format_);
-  } else {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, elementCnt_, valueType_, format_);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
-  CHECK_LE(startRow + numRows, height_);
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (valueType_ == NO_VALUE) {
-    return std::make_shared<CpuSparseMatrix>(
-        nullptr,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        value_,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  }
-}
-
-/* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK(!memAlloc);
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
-  if (format_ == SPARSE_CSR) {
-    /*statistic element number in each col*/
-    int* colCounters = mat->getRows() + 1;
-    memset(colCounters, 0, sizeof(int) * width_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int col = cols_[i];
-      colCounters[col]++;
-    }
-    /*fill mat rows */
-    mat->getRows()[0] = 0;
-    for (size_t i = 1; i < width_ + 1; i++) {
-      mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i];
-    }
-    /*fill mat values and cols*/
-    std::vector<int> colNumVec(width_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          mat->getValue()[index] = value_[j];
-          colNumVec[colIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          colNumVec[colIdx]++;
-        }
-      }
-    }
-  } else {
-    /*statistic element number in each row*/
-    int* rowCounters = mat->getCols() + 1;
-    memset(rowCounters, 0, sizeof(int) * height_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int row = rows_[i];
-      rowCounters[row]++;
-    }
-
-    /*fill mat cols */
-    mat->getCols()[0] = 0;
-    for (size_t i = 1; i < height_ + 1; i++) {
-      mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i];
-    }
-    /*fill mat values and rows*/
-    std::vector<int> rowNumVec(height_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          mat->getValue()[index] = value_[j];
-          rowNumVec[rowIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          rowNumVec[rowIdx]++;
-        }
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  if (format_ == SPARSE_CSR) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    if (0 == row) {
-      rows_[row] = 0;
-    }
-    rows_[row + 1] = rows_[row] + colNum;
-    for (size_t i = 0; i < colNum; ++i) {
-      cols_[rows_[row] + i] = cols[i];
-    }
-    if (valueType_ == NO_VALUE) {
-      CHECK(!values);
-    } else {
-      for (size_t i = 0; i < colNum; ++i) {
-        value_[rows_[row] + i] = values[i];
-      }
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const {
-  if (format_ == SPARSE_CSR) {
-    auto nnz = getElementCnt();
-    IVector::resizeOrCreate(outVec, nnz, false);
-    auto out = outVec->getData();
-    int* rows = getRows();
-    for (size_t i = 0; i < height_; i++) {
-      for (int j = rows[i]; j < rows[i + 1]; j++) {
-        out[j] = i;
-      }
-    }
-  } else {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-  }
-}
-
-ThreadLocal<std::vector<CpuSparseMatrixPtr>> CpuSparseMatrix::cpuLocalMats_;
-
-CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height,
-                                                       size_t width) {
-  std::vector<CpuSparseMatrixPtr>* localMats = cpuLocalMats_.get();
-  auto it = localMats->begin();
-  while (it != localMats->end()) {
-    if (it->unique()) {
-      (*it)->resize(height, width, elementCnt_, valueType_, format_);
-      return *it;
-    }
-  }
-  localMats->emplace_back(std::make_shared<CpuSparseMatrix>(
-      height, width, elementCnt_, valueType_, format_, false));
-  return localMats->back();
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const GpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc, stream);
-  } else if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src) {
-  if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(size_t(elementCnt_), src.getElementCnt());
-  size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_;
-  if (format_ == SPARSE_CSC)
-    hl_memcpy_from_csc_matrix(value_,
-                              valSize,
-                              rows_,
-                              elementCnt_,
-                              cols_,
-                              width_ + 1,
-                              src.sMatrix_.get(),
-                              stream);
-  else
-    hl_memcpy_from_csr_matrix(value_,
-                              valSize,
-                              rows_,
-                              height_ + 1,
-                              cols_,
-                              elementCnt_,
-                              src.sMatrix_.get(),
-                              stream);
-}
-
-void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0];
-  if (format_ == SPARSE_CSR) {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      totalColNum += src.getColNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    rows_[0] = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      rows_[i + 1] = rows_[i] + src.getColNum(i);
-    }
-    memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int));
-  } else {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      totalColNum += src.getRowNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    cols_[0] = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      cols_[i + 1] = cols_[i] + src.getRowNum(i);
-    }
-    memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int));
-  }
-
-  // if have different value type, only copy rows and cols
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real));
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) {
-  size_t totalColNum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    totalColNum += indices[id + 1] - indices[id];
-  }
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    T* row = data + indices[id];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) {
-  CHECK(format_ == SPARSE_CSR);
-  size_t totalColNum = indices[height_] - indices[0];
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    T* row = data + indices[i];
-    size_t colNum = indices[i + 1] - indices[i];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_LE(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  CHECK_EQ(valueType_, src.getValueType());
-  if (format_ == SPARSE_CSR) {
-    int* srcCols = src.getCols();
-    size_t numLessWidth =
-        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
-          return n < this->width_;
-        });
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    rows_[0] = 0;
-    size_t index = 0;
-    for (size_t r = 0; r < height_; ++r) {
-      for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-        if (srcCols[i] < static_cast<int>(width_)) {
-          cols_[index] = srcCols[i];
-          if (valueType_ == FLOAT_VALUE) {
-            value_[index] = src.getValue()[i];
-          }
-          ++index;
-        }
-      }
-      rows_[r + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  } else {
-    size_t numLessWidth = src.getCols()[width_] - src.getCols()[0];
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    cols_[0] = 0;
-    size_t index = 0;
-    // note: c < width_, not src.getWidth();
-    for (size_t c = 0; c < width_; ++c) {
-      for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) {
-        rows_[index] = src.getRows()[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-      cols_[c + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  }
-}
-
-void CpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  memset(value_, 0, elementCnt_ * sizeof(real));
-}
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_float_value_t* data);
-
-void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  maxVal.zeroMem();
-  int* outids = maxIds.getData();
-  real* outvalues = maxVal.getData();
-
-  typedef std::pair<real, size_t> valuepair;
-  std::vector<valuepair> vec;
-  for (size_t i = 0; i < numSamples; i++) {
-    vec.clear();
-
-    auto num = getColNum(i);
-    auto ids = getRowCols(i);
-    auto values = getRowValues(i);
-    for (size_t j = 0; j < num; j++) {
-      vec.push_back(std::make_pair(values[j], ids[j]));
-    }
-
-    size_t outsize = std::min(num, beam);
-    std::partial_sort(vec.begin(),
-                      vec.begin() + outsize,
-                      vec.end(),
-                      [](const valuepair& a, const valuepair& b) {
-                        return a.first > b.first;
-                      });
-    for (size_t j = 0; j < outsize; j++) {
-      outids[i * beam + j] = vec[j].second;
-      outvalues[i * beam + j] = vec[j].first;
-    }
-    if (outsize < beam) {
-      // if the number of values to sort are less than the output size,
-      // use -1 to indicate the end of valid sorted values.
-      outids[i * beam + outsize] = -1;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/CpuSparseMatrix.h b/paddle/legacy/math/CpuSparseMatrix.h
deleted file mode 100644
index 172792c2950..00000000000
--- a/paddle/legacy/math/CpuSparseMatrix.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
- public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false);
-
-  CpuSparseMatrix(CpuMemHandlePtr memHandle,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  ~CpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-  void resize(size_t newHeight, size_t newWidth);
-
-  MatrixPtr getTranspose();
-
-  SparseValueType getValueType();
-
-  real* getRowValues(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return value_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  int* getRowCols(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return cols_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  /// fill row indices of each value in CSR matrix
-  void fillRowIndices(IVectorPtr& outVec) const;
-
-  size_t getColNum(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i + 1] - rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  real* getColumn(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return value_ + cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getColStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowNum(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i + 1] - cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  virtual real getSum() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return elementCnt_;
-    }
-    double sum = 0;
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      sum += value_[i];
-    }
-    return sum;
-  }
-
-  virtual void square2() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return;
-    }
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      value_[i] = value_[i] * value_[i];
-    }
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual min value should compare with 0.0.
-   */
-  virtual real getMin() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real min = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      min = value_[i] < min ? value_[i] : min;
-    }
-    return min;
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual max value should compare with 0.0.
-   */
-  virtual real getMax() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real max = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      max = value_[i] > max ? value_[i] : max;
-    }
-    return max;
-  }
-
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  int* getRows() const { return rows_; }
-  int* getCols() const { return cols_; }
-  real* getValue() const { return value_; }
-  SparseFormat getFormat() const { return format_; }
-  SparseValueType getValueType() const { return valueType_; }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   *  Named add3 just because add/add2 has been used in BaseMatrix.cu
-   *  and they are not virtual function.
-   *
-   *  Only add value of same (row, col) index in dense matrix
-   *  and do not use others values whoes postions are not in sparse matirx.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(CpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  void print(std::ostream& os) const;
-
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-
-  /**
-   * @brief this_row = b_row * c_row[cCol]
-   *
-   * @param[in]  cCol   the column of matrix c used to scale each row of b
-   * @param[in]  b      CpuSparseMatrix
-   * @param[in]  c      Matrix
-   */
-  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
-
-  void randomizeUniform();
-
-  void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream = HPPL_STREAM_DEFAULT);
-
-  void copyFrom(const Matrix& src);
-
-  /**
-   * Get a temporary matrix. This is threadsafe. It should be only used
-   * temporarily, i.e. do not store it or use it as return value.
-   *
-   * @note  Do NOT use large amount of tmp matrix.
-   */
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width);
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows);
-
-  void copyFrom(std::vector<int>& rows,
-                std::vector<int>& cols,
-                std::vector<real>& values);
-
-  void copyFrom(const CpuMatrix& src);
-
-  void copyFrom(const CpuSparseMatrix& src);
-
-  // trim the large size
-  void trimFrom(const CpuSparseMatrix& src);
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data);
-
-  template <class T>
-  void copyFrom(int64_t* indices, T* data);
-
-  void copyFrom(const real* data, size_t len) {
-    LOG(FATAL) << "not supported!";
-  }
-
- private:
-  MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
-
- protected:
-  void sparseResize();
-  /*for csr , record row start position, for csc, record row index for every no
-   * zero value*/
-  int* rows_;
-  /*for csc , record col start position, for csr, record col index for every no
-   * zero value*/
-  int* cols_;
-  real* value_;               /*nonzero value*/
-  SparseFormat format_;       /* matrix format */
-  SparseValueType valueType_; /*with value or not  */
-  static const size_t DEFAULT_AVG_WIDTH = 20;
-
-  static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
-
-  // BaseMatrixT interface
- public:
-  bool isSparse() const { return true; }
-
- private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-}  // namespace paddle
-
-#else
-
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
- public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  real* getValue() const { return nullptr; }
-  size_t getColStartIdx(size_t i) const { return 0; }
-  size_t getRowStartIdx(size_t i) const { return 0; }
-  size_t getColNum(size_t i) const { return 0; }
-  int* getRowCols(size_t i) const { return nullptr; }
-
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
-    return nullptr;
-  }
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/ExecViaCpu.h b/paddle/legacy/math/ExecViaCpu.h
deleted file mode 100644
index ec2337545e9..00000000000
--- a/paddle/legacy/math/ExecViaCpu.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through
- cpu functions. It can automatically make a temporary CPU copy for the
- gpu matrix/vector, and copy back after executing the CPU function.
-
- Examples:
- 1. For a function, functor or lambda:
-   r = execViaCpu(&f, mat, vec)
-
- 2. For member function of CpuMatirx, execViaCpu2 should be used:
-   execViaCpu2(&CpuMatrix::selectElements, *this, table, ids)
-*/
-
-#pragma once
-
-namespace paddle {
-
-template <typename Arg>
-class CopyToCpu {
- public:
-  explicit CopyToCpu(Arg& arg) : arg_(arg) {}
-  Arg& copiedArg() const { return arg_; }
-
- private:
-  Arg& arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
- public:
-  explicit CopyToCpu(Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<const Matrix> {
- public:
-  explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  const Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<IVector> {
- public:
-  explicit CopyToCpu(IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  IVector& arg_;
-  IVectorPtr copied_;
-};
-
-template <>
-class CopyToCpu<const IVector> {
- public:
-  explicit CopyToCpu(const IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  const IVector& arg_;
-  IVectorPtr copied_;
-};
-
-namespace detail {
-
-template <bool isFunction, bool isFunctionPointer, bool isClass, typename F>
-class GpuFuncWrapperImp;
-
-template <typename F, typename R, typename... Args>
-class GpuFuncWrapperBase {
- public:
-  typedef R ResultType;
-  R operator()(F&& f, Args... args) {
-    return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
-                 .copiedArg()...);
-  }
-};
-
-// function
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<true, false, false, R(Args...)>
-    : public GpuFuncWrapperBase<R(Args...), R, Args...> {};
-
-// function pointer
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<false, true, false, R (*)(Args...)>
-    : public GpuFuncWrapperBase<R (*)(Args...), R, Args...> {};
-
-template <typename F, typename Op>
-class GpuFuncWrapperImp2;
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...) const>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...)>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-// functor or lambda
-template <typename F>
-class GpuFuncWrapperImp<false, false, true, F>
-    : public GpuFuncWrapperImp2<F, decltype(&F::operator())> {};
-
-template <typename F>
-class GpuFuncWrapper2
-    : public GpuFuncWrapperImp<
-          std::is_function<F>::value,
-          std::is_pointer<F>::value &&
-              std::is_function<typename std::remove_pointer<F>::type>::value,
-          std::is_class<F>::value,
-          F> {};
-
-template <typename F>
-class GpuFuncWrapper
-    : public GpuFuncWrapper2<typename std::remove_reference<F>::type> {};
-
-}  // namespace detail
-
-template <typename F, typename... Args>
-typename detail::GpuFuncWrapper<F>::ResultType execViaCpu(F&& f,
-                                                          Args&&... args) {
-  return detail::GpuFuncWrapper<F>()(std::move(f), args...);
-}
-
-// The second version is for F as member function of CpuMatrix
-template <typename R, typename... FArgs, typename... Args>
-R execViaCpu2(R (CpuMatrix::*f)(FArgs...), Args&&... args) {
-  auto lambda = [](R (CpuMatrix::*f)(FArgs...), Matrix& ths, FArgs... args) {
-    return (((CpuMatrix&)ths).*f)(args...);
-  };
-  return execViaCpu(lambda, f, args...);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MKLDNNMatrix.cpp b/paddle/legacy/math/MKLDNNMatrix.cpp
deleted file mode 100644
index 52036c5f803..00000000000
--- a/paddle/legacy/math/MKLDNNMatrix.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNMatrix.h"
-
-using namespace mkldnn;  // NOLINT
-
-namespace paddle {
-
-MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
-  memory::desc md = pd.desc();
-  size_t ndims = md.data.ndims;
-  int* dims = md.data.dims;
-  CHECK(ndims > 0) << "Input dims should not be empty";
-  size_t cnts = 1;
-  for (size_t i = 0; i < ndims; ++i) {
-    cnts *= dims[i];
-  }
-
-  if (m == nullptr) {
-    size_t height = dims[0];
-    size_t width = cnts / dims[0];
-    m = Matrix::create(height, width, false, false);
-  }
-  CHECK(m) << " Matrix should not be empty";
-
-  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
-  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
-  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
-}
-
-MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
-                                     memory::format fmt,
-                                     engine& eg,
-                                     MatrixPtr m,
-                                     mkldnn::memory::data_type dtype) {
-  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
-}
-
-std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
-                                                     const MKLDNNMatrixPtr& dst,
-                                                     bool checkData) {
-  if (src == dst || src->getPrimitiveDesc() == dst->getPrimitiveDesc()) {
-    return nullptr;
-  }
-
-  if (checkData && (src->getData() == dst->getData())) {
-    LOG(FATAL) << "can not create reorder with inplace data";
-    return nullptr;
-  }
-
-  memory::dims srcDims = src->getDims();
-  memory::dims dstDims = dst->getDims();
-  CHECK_EQ(srcDims.size(), dstDims.size());
-  for (size_t i = 0; i < srcDims.size(); ++i) {
-    CHECK_EQ(srcDims[i], dstDims[i]);
-  }
-  return std::make_shared<reorder>(*src, *dst);
-}
-
-void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
-                                   memory::format srcFmt,
-                                   memory::dims targetDim) {
-  memory::format dstFmt = getFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
-}
-
-void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
-                                 memory::format dstFmt,
-                                 memory::dims targetDim) {
-  memory::format srcFmt = getFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
-}
-
-void MKLDNNMatrix::reorderOnce(void* srcData,
-                               void* dstData,
-                               memory::format srcFmt,
-                               memory::format dstFmt,
-                               memory::dims dm) {
-  CHECK(srcData);
-  CHECK(dstData);
-  MatrixPtr tmpSrc;
-  if (dstData == srcData) {
-    // inplace data
-    size_t sz = 1;
-    for (size_t i = 0; i < dm.size(); ++i) {
-      sz *= dm[i];
-    }
-    tmpSrc = Matrix::create(sz, 1, false, false);
-    tmpSrc->copyFrom((real*)srcData, sz);
-    srcData = tmpSrc->getData();
-  }
-
-  auto dtype = this->getDtype();
-  auto srcMD = memory::desc(dm, dtype, srcFmt);
-  auto dstMD = memory::desc(dm, dtype, dstFmt);
-
-  auto eg = this->getEngine();
-  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
-  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
-
-  auto r = reorder(src, dst);
-  stream(stream::kind::eager).submit({r}).wait();
-}
-
-void MKLDNNMatrix::downSpatial() {
-  int fmt = getFormat();
-  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
-    // only support nchw and oihw yet, later can support more like nhwc, ihwo
-    return;
-  }
-
-  // TODO(TJ): change H(height) and W(width) if support nhwc or more
-  const int H = 2, W = 3;
-  memory::dims srcDims = getDims();
-  if (srcDims[H] != 1 || srcDims[W] != 1) {
-    // can not down spatial
-    return;
-  }
-
-  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
-  memory::format dstFmt;
-  switch (fmt) {
-    case memory::format::nchw:
-      dstFmt = memory::format::nc;
-      break;
-    case memory::format::oihw:
-      dstFmt = memory::format::oi;
-      break;
-    default:
-      LOG(FATAL) << "unsupported format";
-  }
-  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
-  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  resetMKLDNNMemory(pd, data_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MKLDNNMatrix.h b/paddle/legacy/math/MKLDNNMatrix.h
deleted file mode 100644
index 5a0e5f85923..00000000000
--- a/paddle/legacy/math/MKLDNNMatrix.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Matrix.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-class MKLDNNMatrix;
-typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
-
-#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                        \
-  CHECK(MAT) << " can not be empty.";                                \
-  CHECK(MAT->getPrimitiveDesc() == PD)                               \
-      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
-      << "" __VA_ARGS__;
-
-/**
- * @brief MKLDNN Matrix.
- *
- */
-class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
- public:
-  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
-        mkldnn::memory(pd, m->getData()),
-        m_(m) {}
-
-  ~MKLDNNMatrix() {}
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
-   */
-  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
-                                MatrixPtr m = nullptr);
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory details info
-   */
-  static MKLDNNMatrixPtr create(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::engine& eg,
-      MatrixPtr m = nullptr,
-      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
-
-  /**
-   * Create primitive descriptor.
-   * default with f32 dtype
-   */
-  static mkldnn::memory::primitive_desc createPrimitiveDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt,
-      const mkldnn::engine& eg,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
-  }
-
-  /**
-   * Create Memory descriptor.
-   * default with any format and f32 dtype
-   */
-  static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::desc(dims, dtype, fmt);
-  }
-
-  /**
-   * Create reorder primitive.
-   * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
-   * checkData: whether to check the data handle of src and dst.
-   *            if true, it will check the data and do not allow them equal;
-   *            otherwise, it will not check them, then the reorder created
-   *            may have inplace buffer.
-   *            Do not set false, if you can not guarantee the inplace logical
-   *            would work with your reorder.
-   */
-  static std::shared_ptr<mkldnn::reorder> createReorder(
-      const MKLDNNMatrixPtr& src,
-      const MKLDNNMatrixPtr& dst,
-      bool checkData = true);
-
-  void copyFrom(const Matrix& src) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    m_->copyFrom(src);
-  }
-
-  void copyTo(Matrix& dst) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    dst.copyFrom(*m_);
-  }
-
- public:
-  /**
-   * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change this original dim or format info
-   */
-  void reorderDataFrom(const MKLDNNMatrixPtr& m,
-                       memory::format srcFmt,
-                       memory::dims targetDim);
-
-  /**
-   * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change the dst dim or format info
-   */
-  void reorderDataTo(const MKLDNNMatrixPtr& m,
-                     memory::format dstFmt,
-                     memory::dims targetDim);
-
-  /**
-   * Dimensionality reduction.
-   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
-   */
-  void downSpatial();
-
-  /**
-   * set the memory data handle.
-   * Caution: This will not check the buffer size of the data,
-   *          it should be coverd by user.
-   */
-  void setData(real* data) {
-    set_data_handle(data);
-    CpuMatrix::setData(data);
-    m_.reset();
-  }
-
-  /**
-   * override the CpuMatrix::resize
-   */
-  void resize(size_t newHeight, size_t newWidth) override {
-    m_->resize(newHeight, newWidth);
-    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
-      return;
-    }
-    CpuMatrix::setData(data_);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-    auto pd = mkldnn::memory::primitive_desc(
-        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
-                             getDtype(),
-                             mkldnn::memory::format::nc),
-        getEngine());
-    resetMKLDNNMemory(pd, data_);
-  }
-
-  /**
-   * override Matrix::getData
-   * check data before return
-   */
-  real* getData() override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  const real* getData() const override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  /**
-   * Get primitive descriptor.
-   */
-  mkldnn::memory::primitive_desc getPrimitiveDesc() {
-    return this->get_primitive_desc();
-  }
-
-  /**
-   * Get memory descriptor.
-   */
-  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
-
-  /**
-   * Get dimensions.
-   */
-  mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMemoryDesc();
-    const int* src = md.data.dims;
-    int ndims = md.data.ndims;
-    mkldnn::memory::dims dst;
-    dst.resize(ndims);
-    for (int i = 0; i < ndims; ++i) {
-      dst[i] = src[i];
-    }
-    return dst;
-  }
-
-  /**
-   * Get format.
-   */
-  mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMemoryDesc().data.format);
-  }
-
-  /**
-   * Get memory data type.
-   */
-  mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
-  }
-
-  /**
-   * Get engine.
-   */
-  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
-
- protected:
-  /**
-   * Do reorder once.
-   * Can support inplace.
-   */
-  void reorderOnce(void* srcData,
-                   void* dstData,
-                   memory::format srcFmt,
-                   memory::format dstFmt,
-                   memory::dims dm);
-  /**
-   * reset this MKLDNN Memory from primitve desc
-   */
-  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
-    mkldnn_primitive_t result;
-    mkldnn::error::wrap_c_api(
-        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-        "could not create a memory primitive");
-    reset(result);
-    set_data_handle(data);
-  }
-
- private:
-  // save the CpuMatrixPtr in case the buffer released outside
-  CpuMatrixPtr m_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.cpp b/paddle/legacy/math/MathFunctions.cpp
deleted file mode 100644
index bbf34a32f36..00000000000
--- a/paddle/legacy/math/MathFunctions.cpp
+++ /dev/null
@@ -1,348 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/MathFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "paddle/legacy/utils/DynamicLoader.h"
-
-namespace dynload {
-
-std::once_flag lapack_dso_flag;
-void* lapack_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load lapack routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-
-// The argument for stringizing operator is not macro-expanded first.
-// We have to use two levels of macro to do the expansion.
-// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
-#define STR(x) #x
-
-// clang-format off
-#ifndef LAPACK_FOUND
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
-      void* p_##__name = dlsym(lapack_dso_handle, STR(__name));                \
-      CHECK(p_##__name) << "Cannot find symbol " << STR(__name)                \
-                        << " in liblapack.so";                                 \
-      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      return __name(args...);                                                  \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#define  PADDLE_SGETRF  LAPACKE_sgetrf
-#define  PADDLE_DGETRF  LAPACKE_dgetrf
-#define  PADDLE_SGETRI  LAPACKE_sgetri
-#define  PADDLE_DGETRI  LAPACKE_dgetri
-
-#define LAPACK_ROUTINE_EACH(__macro)       \
-  __macro(PADDLE_SGETRF)                   \
-  __macro(PADDLE_DGETRF)                   \
-  __macro(PADDLE_SGETRI)                   \
-  __macro(PADDLE_DGETRI)
-// clang-format on
-
-LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
-
-}  // namespace dynload
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA,
-                 const CBLAS_TRANSPOSE transB,
-                 const int M,
-                 const int N,
-                 const int K,
-                 const float alpha,
-                 const float* A,
-                 const int lda,
-                 const float* B,
-                 const int ldb,
-                 const float beta,
-                 float* C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE transB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const double alpha,
-                  const double* A,
-                  const int lda,
-                  const double* B,
-                  const int ldb,
-                  const double beta,
-                  double* C,
-                  const int ldc) {
-  cblas_dgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-#endif
-
-template <>
-int getrf<float>(const CBLAS_ORDER order,
-                 const int M,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 int* ipiv) {
-  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getrf<double>(const CBLAS_ORDER order,
-                  const int M,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  int* ipiv) {
-  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getri<float>(const CBLAS_ORDER order,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 const int* ipiv) {
-  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
-}
-
-template <>
-int getri<double>(const CBLAS_ORDER order,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  const int* ipiv) {
-  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
-}
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void axpy<float>(const int n, const float alpha, const float* x, float* y) {
-  cblas_saxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-void axpy<double>(const int n, const double alpha, const double* x, double* y) {
-  cblas_daxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-float dotProduct<float>(const int n, const float* x, const float* y) {
-  return cblas_sdot(n, x, 1, y, 1);
-}
-
-template <>
-double dotProduct<double>(const int n, const double* x, const double* y) {
-  return cblas_ddot(n, x, 1, y, 1);
-}
-#endif
-
-#if defined(PADDLE_WITH_MKLML)
-
-template <>
-void vExp<float>(const int n, const float* a, float* r) {
-  vsExp(n, a, r);
-}
-
-template <>
-void vExp<double>(const int n, const double* a, double* r) {
-  vdExp(n, a, r);
-}
-
-template <>
-void vPow<float>(const int n, const float* a, const float b, float* r) {
-  vsPowx(n, a, b, r);
-}
-
-template <>
-void vPow<double>(const int n, const double* a, const double b, double* r) {
-  vdPowx(n, a, b, r);
-}
-
-template <>
-void vLog<float>(const int n, const float* a, float* r) {
-  vsLn(n, a, r);
-}
-
-template <>
-void vLog<double>(const int n, const double* a, double* r) {
-  vdLn(n, a, r);
-}
-
-template <>
-void vAdd<float>(const int n, const float* a, const float* b, float* r) {
-  vsAdd(n, a, b, r);
-}
-
-template <>
-void vAdd<double>(const int n, const double* a, const double* b, double* r) {
-  vdAdd(n, a, b, r);
-}
-
-template <>
-void vTanh<float>(const int n, const float* a, float* r) {
-  vsTanh(n, a, r);
-}
-
-template <>
-void vTanh<double>(const int n, const double* a, double* r) {
-  vdTanh(n, a, r);
-}
-
-template <>
-void vInvSqrt<float>(const int n, const float* a, float* r) {
-  vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt<double>(const int n, const double* a, double* r) {
-  vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p<float>(const int n, const float* a, float* r) {
-  vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p<double>(const int n, const double* a, double* r) {
-  vdLog1p(n, a, r);
-}
-#else
-
-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
-      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
-template <class T>
-void vLog1p(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
-      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <class T>
-void vTanh(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
-      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
-template void vInvSqrt(const int n, const double* a, double* r);
-template void vInvSqrt(const int n, const float* a, float* r);
-template void vLog1p(const int n, const float* a, float* r);
-template void vLog1p(const int n, const double* a, double* r);
-template void vTanh(const int n, const float* a, float* r);
-template void vTanh(const int n, const double* a, double* r);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.h b/paddle/legacy/math/MathFunctions.h
deleted file mode 100644
index 854e4baa398..00000000000
--- a/paddle/legacy/math/MathFunctions.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_vml_functions.h>
-#endif
-
-#ifdef PADDLE_USE_VECLIB
-extern "C" {
-#include <cblas.h>
-#include <clapack.h>
-}
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#ifdef LAPACK_FOUND
-#include <lapacke.h>
-#endif
-#endif
-
-#ifndef LAPACK_FOUND
-extern "C" {
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-#include <cblas.h>
-#else
-typedef enum CBLAS_ORDER {
-  CblasRowMajor = 101,
-  CblasColMajor = 102
-} CBLAS_ORDER;
-#endif
-int LAPACKE_sgetrf(
-    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
-int LAPACKE_dgetrf(
-    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
-int LAPACKE_sgetri(
-    int matrix_layout, int n, float* a, int lda, const int* ipiv);
-int LAPACKE_dgetri(
-    int matrix_layout, int n, double* a, int lda, const int* ipiv);
-}
-#endif
-
-#include <cmath>
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <class T>
-void gemm(const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB,
-          const int M,
-          const int N,
-          const int K,
-          const T alpha,
-          const T* A,
-          const int lda,
-          const T* B,
-          const int ldb,
-          const T beta,
-          T* C,
-          const int ldc);
-#endif
-
-template <class T>
-int getrf(const CBLAS_ORDER Order,
-          const int M,
-          const int N,
-          T* A,
-          const int lda,
-          int* ipiv);
-
-template <class T>
-int getri(
-    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
-
-template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y) {
-  /// y = y + alpha * x
-  for (int i = 0; i < n; i++) {
-    y[i] = y[i] + alpha * x[i];
-  }
-}
-
-template <class T>
-T dotProduct(const int n, const T* x, const T* y) {
-  T result = static_cast<T>(0);
-  for (int i = 0; i < n; i++) {
-    result += x[i] * y[i];
-  }
-  return result;
-}
-
-template <class T>
-void vExp(const int n, const T* a, T* r);
-
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r);
-
-template <class T>
-void vLog(const int n, const T* a, T* r);
-
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r);
-
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r);
-
-template <class T>
-void vLog1p(const int n, const T* a, T* r);
-
-template <class T>
-void vTanh(const int n, const T* a, T* r);
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathUtils.cpp b/paddle/legacy/math/MathUtils.cpp
deleted file mode 100644
index 47ac9c187ca..00000000000
--- a/paddle/legacy/math/MathUtils.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MathUtils.h"
-#include <algorithm>
-#include "Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/*if csc, major is cols and minor is rows, else
- * major is rows and minor is cols, according to
- * major value to initialize minor value"
- */
-void sparseRand(
-    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
-  CHECK(size_t(nnz) >= size_t(1));
-  int* cpuMajor;
-  int* cpuMinor;
-  CpuIVector cpuMinorVec(nnz);
-  CpuIVector cpuMajorVec(majorLen);
-  if (useGpu) {
-    cpuMajor = cpuMajorVec.getData();
-    cpuMinor = cpuMinorVec.getData();
-  } else {
-    cpuMajor = major;
-    cpuMinor = minor;
-  }
-
-  /*major value init*/
-  for (int i = 0; i < majorLen - 1; i++) {
-    cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1);
-  }
-  cpuMajor[majorLen - 1] = nnz;
-
-  /*minor value init according to major value*/
-  std::vector<char> used(minorMax, 0);
-  for (int i = 0; i < majorLen - 1; i++) {
-    CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax);
-    used.assign(minorMax, 0);
-    for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) {
-      int idx = ::rand() % minorMax;
-      while (used[idx]) {
-        idx = ::rand() % minorMax;
-      }
-      cpuMinor[j] = idx;
-      used[idx] = 1;
-    }
-    std::sort(cpuMinor + cpuMajor[i],
-              cpuMinor + cpuMajor[i + 1],
-              [](int a, int b) { return a < b; });
-  }
-  /*memcpy result to gpu*/
-  if (useGpu) {
-    hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen);
-    hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz);
-  }
-}
-
-int outputSize(
-    int imageSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int outputSize;
-  if (!caffeMode) {
-    outputSize =
-        (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
-  } else {
-    outputSize = (imageSize - filterSize + 2 * padding) / stride + 1;
-  }
-  CHECK_GE(outputSize, 1);
-  return outputSize;
-}
-
-int imageSize(
-    int outputSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int imageSize;
-  if (!caffeMode) {
-    imageSize =
-        (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1;
-  } else {
-    imageSize = (outputSize - 1) * stride + filterSize - 2 * padding;
-  }
-  CHECK_GE(imageSize, 1);
-  return imageSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathUtils.h b/paddle/legacy/math/MathUtils.h
deleted file mode 100644
index 597485d9c54..00000000000
--- a/paddle/legacy/math/MathUtils.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-
-/**
- * this function is for SparseMatrix initialization except data.
- * It generates a random non-zero pattern for a sparse matrix.
- *
- * if format is SPARSE_CSC,
- *    major is column start index and minor is row index
- *    for each non zero value.
- * else
- *    major is row start index and minor is col
- *    index for each non zero value.
- *
- * Initialize minor value according to major value.
- *
- * For example, A is 5*3  CSC matrix, nnz is 10, then
- *
- * @code
- *   cols[i] = i * nnz / 3
- *   cols=[0, 3, 6, 10]
- * @endcode
- *
- * for column i, we randomly select cols[i+1] - cols[i] rows
- * as non zero number row index.
- *
- * rows is [1, 3, 4, 0, 2, 4, 1, 2, 3, 4]
- */
-void sparseRand(
-    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu);
-
-/**
- * Calculate output size based on caffeMode_.
- * - input(+padding): 0123456789
- * - imageSize(+padding) = 10;
- * - filterSize = 3;
- * - stride = 2;
- * - caffeMode is true:
-     - output: (012), (234), (456), (678)
-     - outputSize = 4;
- * - caffeMode is false:
- *   - output: (012), (234), (456), (678), (9)
- *   - outputSize = 5;
- */
-int outputSize(
-    int imageSize, int filterSize, int padding, int stride, bool caffeMode);
-
-/**
- * Calculate image size based on output size and caffeMode_.
- * It is the reverse function of outputSize()
- */
-int imageSize(
-    int outputSize, int filterSize, int padding, int stride, bool caffeMode);
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Matrix.cpp b/paddle/legacy/math/Matrix.cpp
deleted file mode 100644
index e53f95006c3..00000000000
--- a/paddle/legacy/math/Matrix.cpp
+++ /dev/null
@@ -1,4787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "MathFunctions.h"
-#include "SparseMatrix.h"
-#include "SparseRowMatrix.h"
-
-#include <float.h>
-#include <algorithm>
-#include <cmath>
-
-#include <string.h>
-#include "hl_cnn.h"
-#include "hl_gpu.h"
-#include "hl_table_apply.h"
-#include "hl_top_k.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#include "NEONFunctions.h"
-#include "paddle/legacy/function/GemmFunctor.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include "SIMDFunctions.h"
-
-namespace paddle {
-
-inline real _pow(real a, real beta) { return std::pow(a, beta); }
-
-inline real _square(real a) { return a * a; }
-
-inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; }
-
-Matrix::Matrix(MemoryHandlePtr memHandle,
-               size_t height,
-               size_t width,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(
-          height,
-          width,
-          memHandle ? (reinterpret_cast<real*>(memHandle->getBuf())) : nullptr,
-          trans,
-          use_gpu) {
-  elementCnt_ = width * height;
-  memoryHandle_ = memHandle;
-}
-
-Matrix::Matrix(
-    real* data, size_t height, size_t width, bool trans, bool use_gpu)
-    : BaseMatrix(height, width, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-Matrix::Matrix(real* data,
-               size_t height,
-               size_t width,
-               size_t stride,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(height, width, stride, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-MatrixPtr Matrix::createSparseMatrix(real* data,
-                                     int* row,
-                                     int* col,
-                                     size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::create(MemoryHandlePtr memHandle,
-                         size_t height,
-                         size_t width,
-                         bool trans) {
-  if (auto gpuHandle = std::dynamic_pointer_cast<GpuMemoryHandle>(memHandle)) {
-    return std::make_shared<GpuMatrix>(gpuHandle, height, width, trans);
-  } else if (auto cpuHandle =
-                 std::dynamic_pointer_cast<CpuMemoryHandle>(memHandle)) {
-    return std::make_shared<CpuMatrix>(cpuHandle, height, width, trans);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return nullptr;
-  }
-}
-
-MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(
-    real* data, size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(real* data,
-                         size_t height,
-                         size_t width,
-                         size_t stride,
-                         bool trans,
-                         bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, stride, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, stride, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz,
-                                     SparseValueType valueType,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  }
-}
-
-void Matrix::resizeOrCreate(
-    MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::create(height, width, trans, useGpu);
-  } else {
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width);
-  }
-}
-
-void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix,
-                                        size_t height,
-                                        size_t width,
-                                        size_t nnz,
-                                        SparseValueType valueType,
-                                        SparseFormat format,
-                                        bool trans,
-                                        bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::createSparseMatrix(
-        height, width, nnz, valueType, format, trans, useGpu);
-  } else {
-    CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(matrix.get()));
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width, nnz, valueType, format);
-  }
-}
-
-void Matrix::reshape(size_t height, size_t width) {
-  CHECK(isContiguous());
-  CHECK(height_ * width_ == height * width);
-  height_ = height;
-  width_ = width;
-  stride_ = width_;
-}
-
-MatrixPtr Matrix::subMatrix(size_t startRow,
-                            size_t endRow,
-                            size_t startCol,
-                            size_t endCol) {
-  CHECK_LE(startRow, endRow);
-  CHECK_LE(endRow, getHeight());
-  CHECK_LE(startCol, endCol);
-  CHECK_LE(endCol, getWidth());
-
-  return Matrix::create(getData() + startRow * getStride() + startCol,
-                        endRow - startRow,
-                        endCol - startCol,
-                        getStride(),
-                        trans_,
-                        useGpu_);
-}
-
-void Matrix::setDiag(real value) {
-  CHECK(data_ != NULL);
-  CHECK_EQ(height_, width_);
-
-  zeroMem();
-  BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_);
-  diag.assign(value);
-}
-
-GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<GpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             true) {}
-
-GpuMatrix::~GpuMatrix() {}
-
-void GpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  zero();
-}
-
-void GpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  one();
-}
-
-void GpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real GpuMatrix::getElement(size_t x, size_t y) const {
-  real elem = 0;
-  hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real));
-  return elem;
-}
-
-real GpuMatrix::getSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-real GpuMatrix::getMin() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMin();
-}
-
-real GpuMatrix::getMax() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMax();
-}
-
-void GpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0, 1.0);
-}
-
-real GpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_abs_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-void GpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-
-  if (typeid(src) == typeid(CpuMatrix)) {
-    hl_memcpy_host2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_device2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  hl_memcpy_async(this->getData(),
-                  const_cast<real*>(src.getData()),
-                  sizeof(real) * elementCnt_,
-                  stream);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  hl_memcpy_host2device(data_, const_cast<real*>(hostSrc), sizeof(real) * size);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) {
-  LOG(FATAL) << "not implemented";
-}
-
-void GpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CpuMatrix matrix(src.getSize(), 1, false);
-  matrix.copyFrom(src);
-  copyFrom(matrix);
-}
-
-void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  real* dst = getData();
-  real* src = b.getData();
-  const int* index = rowIndex.getData();
-  hl_sequence2batch_copy(dst, src, index, width, height, true);
-}
-
-MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-MatrixPtr GpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    MatrixPtr copy_T(
-        new GpuMatrix(std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-                      height_,
-                      width_,
-                      true));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
-}
-
-void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-
-  real* dataRot = matRot->getData();
-  real* data = getData();
-  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
-}
-
-MatrixPtr GpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<GpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int lda = getStride();
-  int ldc = matInv->getStride();
-
-  hl_matrix_inverse(data, dataInv, height_, lda, ldc);
-}
-
-void GpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  BaseMatrix::addBias(b, scale);
-}
-
-void GpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  CHECK_LE(b.getWidth(), getWidth());
-  CHECK_EQ(getWidth() % b.getWidth(), 0UL);
-  hl_matrix_add_shared_bias(
-      getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale);
-}
-
-void GpuMatrix::collectBias(Matrix& a, real scale) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
-  if (!sMatPtr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    real* data = getData();
-    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
-    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
-  }
-#endif
-}
-
-void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(a.getWidth() % getWidth(), 0UL);
-  hl_matrix_collect_shared_bias(
-      getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale);
-}
-
-void GpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_forward(dst, src, starts, height, width, mode);
-}
-
-void GpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
-}
-
-/* this = scaleAB*(a*b) +  scaleT*this */
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  if (!a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.height_);
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.width_);
-    CHECK_EQ(a.height_, b.height_);
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    CHECK_EQ(width_, b.height_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.width_);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-
-  real* A_d = a.data_;
-  real* B_d = b.data_;
-  real* C_d = data_;
-  int dimM = getHeight();
-  int dimN = getWidth();
-  int dimK = !a.isTransposed() ? a.width_ : a.height_;
-  int lda = a.getStride();
-  int ldb = b.getStride();
-  int ldc = getStride();
-  hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-  hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                scaleAB,
-                scaleT,
-                lda,
-                ldb,
-                ldc);
-}
-
-void GpuMatrix::mul(const GpuSparseMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(!trans_ && !b.trans_) << "not supported";
-
-  if (!a.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_)
-        << "Matrix dimensions are not equal";
-  }
-  hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_sparse_matrix_s A_d = a.sMatrix_.get();
-  real* B_d = b.data_;
-  real* C_d = data_;
-  hl_matrix_csr_mul_dense(A_d,
-                          transA,
-                          B_d,
-                          HPPL_OP_N,
-                          C_d,
-                          height_,
-                          width_,
-                          b.height_,
-                          scaleAB,
-                          scaleT);
-#endif
-}
-
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuSparseMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
-
-  hl_sparse_matrix_s B_d = b.sMatrix_.get();
-  real* A_d = a.data_;
-  real* C_d = data_;
-  hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  if (!b.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_)
-        << "Matrix dimensions are not equal";
-  }
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  }
-#endif
-}
-
-/* this = a*b */
-void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); }
-
-void GpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul(*a_ptr_s, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul(*a_ptr, *b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-/* this = this* b */
-void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&b));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b.isTransposed()) << "Not supported";
-  mul(*this, *dynamic_cast<GpuMatrix*>(&b), scaleAB, scaleT);
-}
-
-/* this = a*this */
-void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!a.isTransposed()) << "Not supported";
-  mul(*dynamic_cast<GpuMatrix*>(&a), *this, scaleAB, scaleT);
-}
-
-void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_select_rows(a,
-                        stride_,
-                        table.getData(),
-                        table.stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_add_to_rows(table.getData(),
-                        table.stride_,
-                        a,
-                        stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::colMerge(Matrix& src) {
-  CHECK(src.height_ == height_);
-  if (!trans_ && !src.trans_) {
-    sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-}
-
-void GpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void GpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-
-  max.maxRows(*this);
-}
-
-void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  hl_matrix_top_k(maxVal.getData(),
-                  maxVal.getStride(),
-                  maxIds.getData(),
-                  this->getData(),
-                  this->getStride(),
-                  this->getWidth(),
-                  beam,
-                  numSamples);
-#endif
-}
-
-void GpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-
-  max.maxCols(*this);
-}
-
-void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  LOG(FATAL) << "Is not supported";
-}
-
-void GpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  const real* input = a.getData();
-  real* output = getData();
-  int* idForGpu = id.getData();
-
-  hl_maxout_forward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-void GpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  real* input = getData();
-  const real* output = a.getData();
-  const int* idForGpu = id.getData();
-
-  hl_maxout_backward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-/*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
-  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
-  size_t numSamples = this->getHeight();
-  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
-  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
-
-  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
-  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
-  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  size_t dim = gpuOutput->getWidth();
-  hl_matrix_classification_error(gpuTopVal->getData(),
-                                 gpuTopVal->getStride(),
-                                 gpuTopIds->getData(),
-                                 gpuOutput->getData(),
-                                 gpuOutput->getStride(),
-                                 dim,
-                                 topkSize,
-                                 numSamples,
-                                 gpuLabel->getData(),
-                                 this->getData());
-}
-
-/* copy -log(output[i * width + label]) to this->data[i] */
-void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&output);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_)
-      << "Matrix dimensions are not equal";
-
-  real* A_d = output_ptr->data_;
-  real* C_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_);
-}
-
-/* calculate the error of outputV according to label */
-void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&outputV);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output_ptr->data_;
-  real* grad_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_);
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                 IVector& label,
-                                                 real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::softmax(Matrix& output) {
-  CHECK(output.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == output.getHeight() && width == output.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  hl_matrix_softmax(inputData, outputData, height, width);
-}
-
-void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  auto starts = index.getData();
-  int numSequences = index.getSize() - 1;
-  hl_sequence_softmax_forward(inputData, outputData, starts, numSequences);
-}
-
-void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  CHECK(height_ == output.height_ && width_ == output.width_ &&
-        height_ == sftmaxSum.height_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output.data_;
-  real* sftmaxSum_d = sftmaxSum.data_;
-  real* grad_d = data_;
-  hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_);
-}
-
-void GpuMatrix::softmaxBackward(Matrix& outputV) {
-  CHECK(outputV.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == outputV.getHeight() && width == outputV.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* output_grad = getData();
-  real* output_value = outputV.getData();
-  hl_softmax_backward(output_value, output_grad, height, width);
-}
-
-void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK_EQ(label.getHeight(), height_);
-  CHECK_EQ(output.getHeight(), height_);
-  CHECK_EQ(label.getWidth(), output.getWidth());
-  CHECK_EQ((size_t)1, width_);
-
-  auto labelptr = dynamic_cast<GpuSparseMatrix*>(&label);
-  if (labelptr) {
-    LOG(FATAL) << "not supported: GpuSparseMatrix as label";
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-  add2(outputV, label, 1, 2, -2);
-}
-
-void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); }
-
-void GpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); }
-
-void GpuMatrix::softreluDerivative(Matrix& output) {
-  BaseMatrix::softreluDerivative(output);
-}
-
-void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  BaseMatrix::scaledTanh(output, p1, p2);
-}
-
-void GpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = data_;
-  size_t size = height_ * width_;
-
-  hl_rand(data, size);
-}
-
-void GpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os);
-}
-
-void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os, height, width);
-}
-
-void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  GpuMatrix gpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  gpuRef.copyFrom(*this);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = gpuRef.getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-void GpuMatrix::upsampleForward(Matrix& input,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-  CHECK(input.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = input.getData();
-  real* maskData = mask.getData();
-  real* outData = data_;
-
-  size_t batch = input.getHeight();
-
-  CHECK(imgSizeH * imgSizeW * channels == input.getWidth());
-  CHECK(imgSizeH * imgSizeW * channels == mask.getWidth());
-  CHECK_EQ(batch, this->getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-  hl_upsample_forward(inputData,
-                      maskData,
-                      batch,
-                      imgSizeH,
-                      imgSizeW,
-                      channels,
-                      outputH,
-                      outputW,
-                      outData);
-}
-
-void GpuMatrix::upsampleBackward(Matrix& outputGrad,
-                                 Matrix& mask,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW) {
-  CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
-
-  real* outputGradData = outputGrad.getData();
-  real* maskData = mask.getData();
-  real* inputGradData = data_;
-  size_t batch = outputGrad.getHeight();
-
-  CHECK(imgSizeH * imgSizeW == this->getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth());
-  hl_upsample_backward(outputGradData,
-                       maskData,
-                       batch,
-                       imgSizeH,
-                       imgSizeW,
-                       channels,
-                       outputH,
-                       outputW,
-                       inputGradData);
-}
-
-void GpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* maskData = NULL;
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  if (maskMatP != NULL) {
-    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
-    CHECK(outputH * outputW * channels == maskMatP->getWidth());
-    maskData = maskMatP->getData();
-  }
-
-  hl_maxpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     maskData);
-}
-
-void GpuMatrix::maxPoolBackward(Matrix& inputMat,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
-        outV.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* outData = outV.getData();
-  real* outDiff = outGrad.getData();
-  size_t frameNum = inputMat.getHeight();
-  size_t channels = outV.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(outGrad.getHeight() == outV.getHeight() &&
-        outGrad.getWidth() == outV.getWidth());
-
-  hl_maxpool_backward(frameNum,
-                      inputData,
-                      outData,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride());
-}
-
-void GpuMatrix::avgPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  hl_avgpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     excludeMode);
-}
-
-void GpuMatrix::avgPoolBackward(Matrix& outGrad,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputH * outputW * channels);
-
-  hl_avgpool_backward(frameNum,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride(),
-                      excludeMode);
-}
-
-void GpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not correct";
-
-  real* inputData = inputMat.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_maxpool3D_forward(num,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       maxPoolIdxData,
-                       getStride());
-}
-
-void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t frameNum = getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
-  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
-        outGrad.getWidth() == maxPoolIdx.getWidth());
-
-  hl_maxpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        maxPoolIdxData,
-                        outGrad.getStride());
-}
-
-void GpuMatrix::avgPool3DForward(Matrix& inputMat,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_forward(frameNum,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       getStride());
-}
-
-void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        outGrad.getStride());
-}
-
-void GpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_forward(
-      inputData, starts, outData, maxIndex, numSequences, dim);
-}
-
-void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
-}
-
-void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  CHECK(data.useGpu_ == true && W.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* input = data.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  real* output = getData();
-  hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  CHECK(oGrad.useGpu_ == true && data.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_w(
-      wgrad, ograd, input, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_diff(
-      ograd, input, w, diff, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-void GpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&in));
-
-  const size_t outputW = getWidth();
-  const size_t outputH = getHeight();
-  const size_t inputW = in.getWidth();
-  const size_t inputH = in.getHeight();
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgW && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    hl_bilinear_forward(inData,
-                        inImgH,
-                        inImgW,
-                        inputH,
-                        inputW,
-                        outData,
-                        outImgH,
-                        outImgW,
-                        outputH,
-                        outputW,
-                        numChannels,
-                        ratioH,
-                        ratioW);
-  }
-}
-
-void GpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&out));
-
-  const size_t inputW = getWidth();
-  const size_t inputH = getHeight();
-  const size_t outputW = out.getWidth();
-  const size_t outputH = out.getHeight();
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (outImgH == inImgH && outImgW == inImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    hl_bilinear_backward(inGrad,
-                         inImgH,
-                         inImgW,
-                         inputH,
-                         inputW,
-                         outGrad,
-                         outImgH,
-                         outImgW,
-                         outputH,
-                         outputW,
-                         numChannels,
-                         ratioH,
-                         ratioW);
-  }
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == 1 &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* entropy_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy(
-      output_d, entropy_d, mat_d, height_, outputPtr->width_);
-#endif
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* grad_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy_bp(
-      output_d, grad_d, mat_d, height_, width_);
-#endif
-}
-
-void GpuMatrix::vol2Col(real* dataSrc,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  hl_matrix_vol2Col(dataSrc,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData());
-}
-
-void GpuMatrix::col2Vol(real* dataDst,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  hl_matrix_col2Vol(dataDst,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData(),
-                    alpha,
-                    beta);
-}
-
-/**
- * CpuMatrix
- */
-
-CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<CpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             false) {}
-
-CpuMatrix::~CpuMatrix() {}
-
-void CpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  if (isContiguous()) {
-    memset(data_, 0, height_ * width_ * sizeof(real));
-  } else {
-    BaseMatrix::zero();
-  }
-}
-void CpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  BaseMatrix::one();
-}
-
-void CpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    hl_memcpy_device2host(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuMatrix) ||
-             typeid(src) == typeid(SharedCpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuSparseMatrix)) {
-    CHECK_GE(elementCnt_, src.getElementCnt());
-    copyFrom(dynamic_cast<CpuSparseMatrix&>(const_cast<Matrix&>(src)));
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(CpuSparseMatrix& src) {
-  CHECK(isContiguous());
-  CHECK(height_ == src.getHeight());
-  CHECK(width_ == src.getWidth());
-  memset(data_, 0, sizeof(real) * height_ * width_);
-  if (src.getValueType() == FLOAT_VALUE) {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = vals[j];
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = vals[j];
-        }
-      }
-    }
-  } else {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = 1.0;
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = 1.0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_async(this->getData(),
-                    const_cast<real*>(src.getData()),
-                    sizeof(real) * elementCnt_,
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else if (typeid(src) == typeid(CpuMatrix)) {
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  memcpy(data_, cpuSrc, sizeof(real) * size);
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; i++) {
-    memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_);
-  }
-}
-
-void CpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CHECK(elementCnt_ == src.getSize())
-      << "the src and dst should have same size.";
-  const int* cpuSrc = NULL;
-  IVectorPtr tmp;
-  if (src.useGpu()) {
-    CpuIVector tmp(src.getSize());
-    tmp.copyFrom(src);
-    cpuSrc = tmp.getData();
-  } else {
-    cpuSrc = src.getData();
-  }
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    data_[i] = cpuSrc[i];
-  }
-}
-
-void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  const int* index = rowIndex.getData();
-  for (size_t i = 0; i < height; i++) {
-    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
-    real* src = b.getData() + index[i] * width;
-    real* dst = getData() + i * width;
-    memcpy(dst, src, sizeof(real) * width);
-  }
-}
-
-MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-void CpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real CpuMatrix::getElement(size_t x, size_t y) const {
-  return data_[x * stride_ + y];
-}
-
-real CpuMatrix::getSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += data_[i * width_ + j];
-    }
-  }
-  return sum;
-}
-
-void CpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-
-  sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1);
-}
-
-real CpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += fabs(data_[i * width_ + j]);
-    }
-  }
-  return sum;
-}
-
-MatrixPtr CpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    return std::make_shared<CpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        true);
-  } else {
-    MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      dataTrans[j * ldc + i] = data[i * lda + j];
-    }
-  }
-}
-
-void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-  real* dataRot = matRot->getData();
-  real* data = getData();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      if (clockWise) {
-        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
-      } else {
-        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
-      }
-    }
-  }
-}
-
-MatrixPtr CpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<CpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  CHECK_EQ(height_, matInv->getHeight());
-  CHECK_EQ(width_, matInv->getWidth());
-  matInv->copyFrom(*this);
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int ldc = matInv->getStride();
-
-  if (height_ == 1) {
-    CHECK_NE(*data, 0);
-    *dataInv = 1.0 / (*data);
-    return;
-  }
-
-  /* Compute the LU decomposition of the matrix */
-  std::vector<int> ipiv(height_);
-  CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor);
-  int info = getrf<real>(order, height_, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-
-  /* Compute the inverse of the matrix given its LU decompsotion */
-  info = getri<real>(order, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-}
-
-void CpuMatrix::upsampleForward(Matrix& input,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-  real* inputData = input.getData();
-  real* maskData = mask.getData();
-  real* outData = data_;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t batch = input.getHeight();
-  CHECK(inLength == input.getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-
-  for (size_t k = 0; k < batch; k++) {
-    for (size_t c = 0; c < channels; c++) {
-      for (size_t i = 0; i < inLength; i++) {
-        size_t out_index = static_cast<int>(maskData[i]);
-        if (out_index >= outLength) {
-          LOG(FATAL) << "upsample index " << out_index << " out of range.";
-        }
-        outData[out_index] = inputData[i];
-      }
-      inputData += inLength;
-      maskData += inLength;
-      outData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::upsampleBackward(Matrix& outputGrad,
-                                 Matrix& mask,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW) {
-  real* outputGradData = outputGrad.getData();
-  real* maskData = mask.getData();
-  real* inputGradData = data_;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t batch = outputGrad.getHeight();
-  CHECK(inLength == this->getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outLength, outputGrad.getWidth());
-
-  for (size_t k = 0; k < batch; k++) {
-    for (size_t c = 0; c < channels; c++) {
-      for (size_t i = 0; i < inLength; i++) {
-        size_t out_index = static_cast<int>(maskData[i]);
-        if (out_index >= outLength) {
-          LOG(FATAL) << "upsample index " << out_index << " out of range.";
-        }
-        inputGradData[i] = outputGradData[out_index];
-      }
-      inputGradData += inLength;
-      maskData += inLength;
-      outputGradData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  real* inputData = inputMat.getData();
-  real* outData = data_;
-  real* maskData = NULL;
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  if (maskMatP != NULL) {
-    maskData = maskMatP->getData();
-    CHECK_EQ(channels * outLength, maskMatP->getWidth());
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = data_ + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = hstart + sizeY;
-        hstart = hstart < 0 ? 0 : hstart;
-        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = wstart + sizeX;
-          wstart = wstart < 0 ? 0 : wstart;
-          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
-
-          real maxval = -(real)FLT_MAX;
-          int max_index = -1;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              if (maxval < inputData[h * imgSizeW + w]) {
-                maxval = inputData[h * imgSizeW + w];
-                max_index = h * imgSizeW + w;
-              }
-            }
-          }
-
-          outData[ph * outputW + pw] = maxval;
-          if (maskData != NULL) maskData[ph * outputW + pw] = max_index;
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-
-      if (maskData != NULL) maskData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPoolBackward(Matrix& image,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  size_t num = image.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(image.getWidth() == inLength * channels);
-  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
-  CHECK(outV.getHeight() == outGrad.getHeight() &&
-        outV.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = data_;
-  real* inData = image.getData();
-  real* otData = outV.getData();
-  real* otGrad = outGrad.getData();
-
-  size_t outStride = outV.getStride();
-  real* origOutData = otData;
-  real* origOutGrad = otGrad;
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outV.isContiguous()) {
-      otData = origOutData + n * outStride;
-      otGrad = origOutGrad + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtGrad[h * imgSizeW + w] =
-                  scaleTargets * tgtGrad[h * imgSizeW + w] +
-                  scaleOutput * otGrad[ph * outputW + pw] *
-                      (inData[h * imgSizeW + w] == otData[ph * outputW + pw]);
-            }
-          }
-        }
-      }
-      // offset
-      inData += inLength;
-      tgtGrad += inLength;
-      otData += outLength;
-      otGrad += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolForward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = data_;
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          tgtData[ph * outputW + pw] = 0;  // clear
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
-            }
-          }
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-          tgtData[ph * outputW + pw] /= poolSize;
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolBackward(Matrix& input,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  size_t num = input.getHeight();
-  size_t channels = input.getWidth() / outputH / outputW;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize;
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  real* inputData = inputMat.getData();
-  real* outData = getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[(i)*outStride + j] = -(real)FLT_MAX;
-      maxPoolIdxData[(i)*outStride + j] = -1;
-    }
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int maxIdx = -1;
-            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  if (maxOutData <
-                      inputData[(d * imgSizeH + h) * imgSizeW + w]) {
-                    maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w];
-                    maxIdx = (d * imgSizeH + h) * imgSizeW + w;
-                  }
-                }
-              }
-            }
-            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
-            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
-          }
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
-        maxPoolIdx.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = getData();
-  real* otGrad = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t outStride = outGrad.getStride();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outGrad.isContiguous()) {
-      otGrad = outGrad.getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            const size_t index = (pd * outputH + ph) * outputW + pw;
-            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
-            tgtGrad[tgtIdx] =
-                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
-          }
-        }
-      }
-      // offset
-      tgtGrad += inLength;
-      otGrad += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DForward(Matrix& input,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = getData();
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-
-            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  tgtData[(pd * outputH + ph) * outputW + pw] +=
-                      inData[(d * imgSizeH + h) * imgSizeW + w];
-                }
-              }
-            }
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
-          }
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DBackward(Matrix& input,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = input.getWidth() / outLength;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
-                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
-                }
-              }
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-/**
- * Input: one or more sequences. Each sequence contains some instances.
- * Output: output size is the number of input sequences (NOT input instances).
- * output[i] is set to max_{for each instance in this sequence}{input[i]}
- */
-void CpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&input));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(starts[numSequences], (int)input.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence, loop for each input instance
-    // (1) first instance: do not need compare, copy value to outV directly
-    for (size_t k = 0; k < dim; ++k) {
-      outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k];
-      maxIndex[sequenceId * dim + k] = starts[sequenceId];
-    }
-    // (2) other instance in same sequence
-    for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1];
-         ++insId) {
-      // insId is the index on all instances
-      for (size_t k = 0; k < dim; ++k) {
-        // for each dim
-        if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) {
-          // update max value and record index
-          outData[sequenceId * dim + k] = inputData[insId * dim + k];
-          maxIndex[sequenceId * dim + k] = insId;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence
-    for (size_t j = 0; j < dim; ++j) {
-      // each dim
-      int insId = maxIndex[sequenceId * dim + j];
-      inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j];
-    }
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += b[i];
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth] * c;
-  }
-}
-
-void CpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-
-  if (scale == 1 && getStride() % 32 == 0) {  // use libaddto
-    // @TODO(yuyang18) Make input addr can be unaligned.
-    // So merge this if and else
-    CHECK_EQ((size_t)aData % 32, 0UL);
-    CHECK_EQ((size_t)bData % 32, 0UL);
-    for (size_t i = 0; i < numSamples; i++) {
-      simd::addTo(aData + i * getStride(), bData, dim);
-    }
-  } else {
-    for (size_t i = 0; i < numSamples; i++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + j] += scale * bData[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t channel = b.getWidth();
-  CHECK_EQ(getWidth() % channel, 0UL);
-  size_t dim = getWidth() / channel;
-
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + c * dim + j] += scale * bData[c];
-      }
-    }
-  }
-}
-
-void CpuMatrix::collectBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
-  if (!aptr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    size_t nnz = aptr->getElementCnt();
-    int* cols = aptr->getCols();
-    real* A = aptr->getValue();
-    real* B = getData();
-    for (size_t i = 0; i < nnz; i++) {
-      B[cols[i]] += scale * A[i];
-    }
-  }
-}
-
-void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  real* B = getData();
-  real* A = a.getData();
-  size_t numSamples = a.getHeight();
-  size_t channel = getWidth();
-  CHECK_EQ(a.getWidth() % channel, 0UL);
-  size_t dim = a.getWidth() / channel;
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        B[c] += scale * A[i * channel * dim + c * dim + j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; i++) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + i * width);
-    dataMtx->setData(src + starts[i] * width, sequenceLength, width);
-    if (mode == 0) {
-      // plain average
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / (real)sequenceLength,
-                      /* scaleDest= */ 1);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / std::sqrt(sequenceLength),
-                      /* scaleDest= */ 1);
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; ++i) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
-    dataMtx->setData(src + i * width);
-    if (mode == 0) {
-      // plain average
-      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->addBias(*dataMtx, 1.0f);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-/* this = scaleAB*(a*b) + scaleT*this*/
-void CpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuSparseMatrix* a,
-                    CpuMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<CacheRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<SparseRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else {
-    return mul(a, b, this, scaleAB, scaleT);
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  size_t a_col, b_col, a_row, b_row;
-  bool a_trans, b_trans;
-  if (!a->isTransposed()) {
-    a_col = a->getWidth();
-    a_row = a->getHeight();
-    a_trans = false;
-  } else {
-    a_col = a->getHeight();
-    a_row = a->getWidth();
-    a_trans = true;
-  }
-  if (!b->isTransposed()) {
-    b_col = b->getWidth();
-    b_row = b->getHeight();
-    b_trans = false;
-  } else {
-    b_col = b->getHeight();
-    b_row = b->getWidth();
-    b_trans = true;
-  }
-
-  CHECK_EQ(a_col, b_row);
-  CHECK_EQ(a_row, getHeight());
-  CHECK_EQ(b_col, getWidth());
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = getData();
-
-  int M = getHeight();
-  int N = getWidth();
-  int K = a_col;
-  int lda = a->getStride();
-  int ldb = b->getStride();
-  int ldc = getStride();
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
-}
-
-void CpuMatrix::mul(
-    CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK_EQ(c->getValueType(), FLOAT_VALUE);
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = c->getValue();
-  int* rows = c->getRows();
-  int* cols = c->getCols();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[rowIdx * m + k] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getHeight();
-    CHECK_EQ(m, b->getHeight());
-    CHECK_EQ(b->getWidth(), width);
-    CHECK_EQ(a->getWidth(), height);
-
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + rowIdx] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        int start = c->getRowStartIdx(i);
-        int end = c->getRowStartIdx(i + 1);
-        for (int j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + i] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (!a->isTransposed() && b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getWidth(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getHeight(), width);
-    if (c->getFormat() == SPARSE_CSR) {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[colIdx * m + k];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      LOG(FATAL) << "Not supported csc format "
-                    "when a is not trans and b is trans";
-    }
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a,
-                    CpuSparseMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!trans_) << "Not supported";
-  CHECK(!a->isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1);
-
-  // TODO(yuyang18): Maybe bug implementation here
-  CHECK_EQ(scaleAB, static_cast<real>(1.0));
-
-  real* A = a->getData();
-  real* B = b->getValue();
-  real* C = getData();
-  int* rows = b->getRows();
-  int* cols = b->getCols();
-
-  if (scaleT == 0) {
-    zeroMem();
-  }
-  if (b->getFormat() == SPARSE_CSC) {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + j, A + rows[i], B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + rows[j], A + i, B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  } else {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + cols[i], A + j, B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + i, A + cols[j], B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::selectRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    selectRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-void CpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    data_[i] += tableData[i * table.getWidth() + idsData[i]];
-  }
-}
-
-void CpuMatrix::addElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    tableData[i * table.getWidth() + idsData[i]] += data_[i];
-  }
-}
-
-// this.row[i] += table.row[ids[i]]
-template <typename TableMatType>
-void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(a + i * stride_, table.getRow(index[i]), dim);
-  }
-}
-
-void CpuMatrix::addToRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    addToRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-// table.row[ids[i]] += this.row[i]
-template <typename TableMatType>
-void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(table.getRow(index[i]), a + i * stride_, dim);
-  }
-}
-
-static ThreadLocal<std::vector<const real*>> threadLocalColArray;
-
-template <typename MatBType, typename MatCType>
-void CpuMatrix::mul(
-    CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  // TODO(yuyang18): Maybe bug implementation here.
-  CHECK(scaleAB == 1) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported";
-
-  real* B = b->getData();
-  real* C = c->getData();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  int* cols = a->getCols();
-  real* values = a->getValue();
-
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        auto& colArray = *threadLocalColArray;
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          size_t colNum = end - start;
-          colArray.resize(colNum);
-          for (int j = 0; j < end - start; ++j) {
-            colArray[j] = b->getRow(cols[j + start]);
-          }
-          simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width);
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(i), b->getRow(cols[j]), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width);
-        }
-      }
-    }
-  } else /*if (a->isTransposed())*/ {
-    size_t m = a->getHeight();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getWidth(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            simd::addTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width);
-        }
-      }
-    }
-  }
-}
-
-// instantiation mul() called in SparseRowMatrix.cpp
-template void CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseAutoGrowRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
-                                                           CpuMatrix* b,
-                                                           CacheRowCpuMatrix* c,
-                                                           real scaleAB,
-                                                           real scaleT);
-
-#ifndef PADDLE_MOBILE_INFERENCE
-void SharedCpuMatrix::mul(CpuSparseMatrix* a,
-                          CpuMatrix* b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  CHECK_EQ(scaleAB, 1) << "Not supported";
-  CHECK_EQ(scaleT, 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported";
-
-  real* B = b->getData();
-  real* C = getData();
-  size_t height = getHeight();
-  size_t width = getWidth();
-
-  // get real trans
-  MatrixPtr aTrans;
-  if (a->isTransposed()) {
-    aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight());
-    a->transpose(aTrans, false);
-  }
-  a = dynamic_cast<CpuSparseMatrix*>(aTrans.get());
-
-  size_t m = a->getWidth();
-  CHECK_EQ(b->getHeight(), m);
-  CHECK_EQ(a->getHeight(), height);
-  CHECK_EQ(b->getWidth(), width);
-
-  size_t blockSize = (height / blockNum_) + 1;
-  CpuMatrixPtr localBuf = *localBuf_;
-  if (!localBuf) {
-    localBuf = std::make_shared<CpuMatrix>(blockSize, width);
-  } else {
-    localBuf->resize(blockSize, width);
-  }
-  localBuf->zeroMem();
-  real* localC = localBuf->getData();
-  std::vector<int>& blockSeq = *blockSeq_;
-  if (blockSeq.size() == 0) {
-    for (int k = 0; k < blockNum_; ++k) {
-      blockSeq.push_back(k);
-    }
-    std::shuffle(
-        blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get());
-  }
-  std::vector<int>& localBufRows = *localBufRows_;
-  int* cols = a->getCols();
-  real* value = a->getValue();
-
-  for (int k = 0; k < blockNum_; ++k) {
-    int blockId = blockSeq[k];
-    size_t blockBegin = blockId * blockSize;
-    size_t blockEnd = (blockId + 1) * blockSize;
-    if (blockId == blockNum_ - 1) {
-      blockEnd = height;
-    }
-    if (a->getValueType() == NO_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(localC + bufPos * width, B + cols[j] * width, width);
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(
-              localC + bufPos * width, B + cols[j] * width, value[j], width);
-        }
-      }
-    }
-
-    {
-      std::lock_guard<std::mutex> guard(*blockLocks_[blockId]);
-      for (size_t i = 0; i < localBufRows.size(); ++i) {
-        vecAddTo(C + localBufRows[i] * width, localC + i * width, width);
-      }
-    }
-    memset(localC, 0, localBufRows.size() * width * sizeof(real));
-    localBufRows.clear();
-  }
-
-  VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0]
-          << " C[1]=" << C[1];
-}
-
-void SharedCpuMatrix::add(Matrix& b, real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(b, p1, p2);
-}
-
-void SharedCpuMatrix::add(real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(p1, p2);
-}
-
-void SharedCpuMatrix::initShared(int blockNum) {
-  CHECK_GT(height_ * width_, 1UL * 1024 * 1024)
-      << "should not share small matrix";
-  initBlock(blockNum);
-}
-
-void SharedCpuMatrix::initBlock(int blockNum) {
-  CHECK_LE(blockNum, 200) << "should not use large block number";
-  blockNum_ = blockNum;
-  blockLocks_.resize(blockNum);
-  for (auto& locker : blockLocks_) {
-    locker.reset(new std::mutex);
-  }
-}
-
-#endif
-/* Add a (column) vector b to matrix a, column by column */
-void CpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-/* this = a*b */
-void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
-  return mul(a, b, 1.0, 0.0);
-}
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  (void)b;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = this* b */
-void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  (void)a;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = a*this) */
-void CpuMatrix::leftMul(Matrix& a) { return leftMul(a, 1.0, 0.0); }
-
-void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); }
-
-void CpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void CpuMatrix::rowMaxId(IVector& maxIds) {
-  CHECK(!maxIds.useGpu()) << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  size_t dim = getWidth();
-
-  for (size_t i = 0; i < numSamples; i++) {
-    real sm = a[i * dim];
-    int maxId = 0;
-    for (size_t j = 1; j < dim; j++) {
-      if (a[i * dim + j] > sm) {
-        maxId = j;
-        sm = a[i * dim + j];
-      }
-    }
-    s[i] = maxId;
-  }
-}
-
-void CpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-  max.maxRows(*this);
-}
-
-/* Get the top k elements of each row of this matrix */
-void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getWidth();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i * dim + j], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i * beam + j] = vec[j].first;
-      s[i * beam + j] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-  max.maxCols(*this);
-}
-
-void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getWidth();
-  size_t beam = maxVal.getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getWidth(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getHeight();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i + j * numSamples] = vec[j].first;
-      s[i + j * numSamples] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  const real* input = a.getData();
-  int* idForCpu = id.getData();
-
-  MatrixPtr maxInMat, maxOutMat;
-  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
-  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
-
-    for (size_t i = 0; i < channels; ++i) {
-      size_t newFeatLen = i * featLen;
-      for (size_t j = 0; j < groups; ++j) {
-        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
-            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
-                       featLen);
-      }
-    }
-    maxInMat->colMax(*tmpId, *maxOutMat);
-    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
-  }
-}
-
-void CpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  size_t newFeatLen = groups * featLen;
-  real* inputG = getData();
-  const real* outG = a.getData();
-  int* idForCpu = id.getData();
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    int* idData = idForCpu + newIndex;
-
-    for (size_t i = 0; i < size; ++i) {
-      int gradIdx =
-          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
-      (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
-    }
-  }
-}
-
-void CpuMatrix::rowNormalizeL1(Matrix& out) {
-  CHECK(!out.useGpu());
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(out.getHeight(), numSamples);
-  CHECK_EQ(out.getWidth(), dim);
-  real* a = getData();
-  real* b = out.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real s = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      s += a[i * dim + j];
-    }
-    // Right now, we just bet that sum won't be zero. If this really happens,
-    // we will figure out what should be done then.
-    CHECK_GT(s, 0);
-    s = 1 / s;
-    for (size_t j = 0; j < dim; ++j) {
-      b[i * dim + j] = s * a[i * dim + j];
-    }
-  }
-}
-
-/* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  size_t numSamples = this->getHeight();
-  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
-  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
-  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
-  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
-
-  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
-  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
-  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  // top k matrix classification
-  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
-
-  size_t dim = cpuOutput->getWidth();
-  real* result = this->getData();
-  int* ids = cpuTopIds->getData();
-  int* lbl = cpuLabel->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-
-    for (size_t j = 0; j < topkSize; ++j) {
-      if (ids[j + i * topkSize] == lbl[i]) {
-        result[i] = 0;
-        break;
-      }
-      result[i] = 1.0f;
-    }
-  }
-}
-
-/* copy -log(output[label]) to this->data[i] */
-void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    cost[i] = -std::log(out[lbl[i]]);
-  }
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = _safelog(sum);
-    cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum);
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output,
-                                                 IVector& label,
-                                                 real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      if (j == (size_t)lbl[i]) {
-        grad[j] += -1 / out[j];
-      }
-      grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum;
-    }
-  }
-}
-
-#define FORWARD_LOOP()                      \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  const real* in = getData();               \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim)
-
-#define BACKWARD_LOOP()                     \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  real* grad = getData();                   \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim)
-
-void CpuMatrix::softmax(Matrix& output) {
-  CHECK(!output.useGpu());
-
-  const float THRESHOLD = -64.0;
-
-  FORWARD_LOOP() {
-    real max = -1.0e20;
-    for (size_t j = 0; j < dim; ++j) {
-      if (in[j] > max) {
-        max = in[j];
-      }
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      real a = in[j] - max;
-      if (a < THRESHOLD) {
-        a = THRESHOLD;
-      }
-      out[j] = a;
-    }
-    vExp(dim, out, out);
-
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = 1 / sum;
-    for (size_t j = 0; j < dim; ++j) {
-      out[j] *= sum;
-    }
-  }
-}
-
-void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  MatrixPtr inTmp = Matrix::create(nullptr,
-                                   /* height= */ 1,
-                                   1,
-                                   /* trans= */ false,
-                                   false);
-  MatrixPtr outTmp = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    false);
-  size_t numSequences = index.getSize() - 1;
-  auto starts = index.getData();
-  for (size_t i = 0; i < numSequences; ++i) {
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    inTmp->setData(getData() + offset, 1UL, size);
-    outTmp->setData(output.getData() + offset, 1UL, size);
-    inTmp->softmax(*outTmp);
-  }
-}
-
-void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(getHeight(), sftmaxSum.getHeight());
-
-  real* sums = sftmaxSum.getData();
-
-  BACKWARD_LOOP() {
-    real sum = sums[i];
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] = out[j] * (grad[j] - sum);
-    }
-  }
-}
-
-void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
-  real* cost = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          cost[i] += _square(out[i * dim + j]);
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(1.0 - out[i * dim + feature.col]);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          real sum1 = 0;
-          real sum2 = 0;
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            sum1 += values[j] * values[j];
-            sum2 += values[j] * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(value.col - out[i * dim + feature.col]);
-             */
-          }
-          cost[i] += sum1 - 2.0 * sum2;
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  CHECK_EQ(label.getWidth(), dim);
-
-  real* out = output.getData();
-  real* grad = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          grad[i * dim + j] += 2.0 * out[i * dim + j];
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0;
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - 1);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0 * values[j];
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - value.col);
-             */
-          }
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  real* lbl = label.getData();
-  size_t ld = getStride();
-  size_t outLd = output.getStride();
-  size_t lblLd = label.getStride();
-  CHECK(lbl);
-  for (size_t i = 0; i < numSamples;
-       ++i, out += outLd, lbl += lblLd, grad += ld) {
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] += 2.0 * (out[j] - lbl[j]);  // positive gradient;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* cost = getData();
-  real* out = output.getData();
-  real* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real absVal = std::fabs(out[j] - lbl[j]);
-      cost[i] *= destScale;
-      if (absVal < 1.0)
-        cost[i] += 0.5 * absVal * absVal;
-      else
-        cost[i] += absVal - 0.5;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), dim);
-
-  real* out = output.getData();
-  real* lbl = label.getData();
-  real* grad = getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real val = out[j] - lbl[j];
-      grad[j] *= destScale;
-      if (std::fabs(val) < 1) {
-        grad[j] += val;
-      } else {
-        grad[j] += (real(0) < val) - (val < real(0));
-      }
-    }
-  }
-}
-
-void CpuMatrix::tanh(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  vTanh(numSamples * dim, getData(), output.getData());
-}
-
-void CpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void CpuMatrix::softrelu(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  const real THRESHOLD = 40.0;
-  FORWARD_LOOP() {  // TODO(yuyang18): SIMD it?
-    for (size_t j = 0; j < dim; ++j) {
-      real x = in[j];
-      if (x > THRESHOLD) {
-        x = THRESHOLD;
-      } else if (x < -THRESHOLD) {
-        x = -THRESHOLD;
-      }
-      out[j] = x;
-    }
-  }
-  vExp(numSamples * dim, output.getData(), output.getData());
-  vLog1p(numSamples * dim, output.getData(), output.getData());
-}
-
-void CpuMatrix::softreluDerivative(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  size_t size = numSamples * dim;
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  real* grad = getData();
-  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
-  real* tmp = tmpMat->getData();
-
-  vExp(size, output.getData(), tmpMat->getData());
-
-  for (size_t i = 0; i < size; ++i) {
-    grad[i] *= (1.0 - 1.0 / tmp[i]);
-  }
-}
-
-void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-
-  const real* in = getData();
-  real* out = output.getData();
-
-  // out = p2*in
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p2 * in[i];
-  }
-
-  vTanh(numSamples * dim, out, out);
-
-  // out = p1 * out
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p1 * out[i];
-  }
-}
-
-/* uniform randomization, minimize precision = 1e-5 */
-void CpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = getData();
-  unsigned int* randSeed = ThreadLocalRand::getSeed();
-  real recipRandMax = 1.0f / (real)RAND_MAX;
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    *data++ = rand_r(randSeed) * recipRandMax;
-  }
-}
-
-void CpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  real* input = data.getData();
-  real* w = W.getData();
-  real* output = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-
-  size_t partial_sum = numElements / paraSize;
-  if (paraSize == numElements) {
-    for (size_t n = 0; n < numSamples * numElements; ++n) {
-      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
-    }
-    return;
-  }
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-  for (size_t n = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < paraSize; i++) {
-      neon::prelu(
-          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
-    }
-    input = input + numElements;
-    output = output + numElements;
-  }
-#else
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
-    }
-  }
-#endif
-}
-
-void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
-    }
-  }
-}
-
-void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
-    }
-  }
-}
-
-void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  size_t h = height_ < height ? height_ : height;
-  size_t w = width_ < width ? width_ : width;
-  os.setf(std::ostream::scientific);
-  os << "[";
-  for (size_t i = 0; i < h; ++i) {
-    for (size_t j = 0; j < w; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    if (i == h - 1) {
-      os << "]";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  size_t offset = idx * stride_;
-  os << data_[offset];
-  for (size_t i = 1; i < width_; ++i) {
-    os << " " << data_[offset + i];
-  }
-  os << ";";
-}
-
-void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-real CpuMatrix::getMin() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res > data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-real CpuMatrix::getMax() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res < data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
-  size_t height = this->getHeight();
-  size_t width0 = this->getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in0.getHeight());
-  CHECK_EQ(width0, in0.getWidth());
-  CHECK_EQ(height, in1.getHeight());
-
-  CHECK_EQ(width1 % 2, 1U);
-
-  real* outV = this->getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height;
-       ++x, outV += width0, inV0 += width0, inV1 += width1) {
-    for (size_t i = 0; i < width0; ++i) {  // each dimension of output
-      for (size_t j = 0; j < width1; ++j) {
-        // iterate over all dimentions of inV1
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        outV[i] += inV0[index] * inV1[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::circularConvDerivative(
-    Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) {
-  size_t height = in0.getHeight();
-  size_t width0 = in0.getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in1.getHeight());
-  CHECK_EQ(height, inG0.getHeight());
-  CHECK_EQ(width0, inG0.getWidth());
-  CHECK_EQ(height, inG1.getHeight());
-  CHECK_EQ(width1, inG1.getWidth());
-  CHECK_EQ(height, outG.getHeight());
-  CHECK_EQ(width0, outG.getWidth());
-
-  real* outGV = outG.getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-  real* inGV0 = inG0.getData();
-  real* inGV1 = inG1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height; ++x,
-              outGV += width0,
-              inV0 += width0,
-              inV1 += width1,
-              inGV0 += width0,
-              inGV1 += width1) {
-    for (size_t j = 0; j < width1; ++j) {  // iterate over width1
-      for (size_t i = 0; i < width0; ++i) {
-        // such over all dimensions of outG
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        inGV0[index] += outGV[i] * inV1[j];
-        inGV1[j] += outGV[i] * inV0[index];
-      }
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* cost = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      cost[i] -= std::log(1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]]));
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, output.getWidth());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* grad = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      grad[j] += 1.0 / (1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]]));
-    }
-  }
-}
-
-/* calculate the classification error for multi binary label */
-void CpuMatrix::classificationErrorMulti(Matrix& output,
-                                         Matrix& label,
-                                         real threshold) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* result = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    real sum = 0.0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (out[j] >= threshold) {
-        sum += 1.0;
-      }
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      if (out[cols[j]] < threshold) {
-        sum += 1.0;
-      } else {
-        sum -= 1.0;
-      }
-    }
-    result[i] = sum / dim;
-  }
-}
-
-void CpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&in));
-
-  size_t outputW = getWidth();
-  size_t batchSize = getHeight();
-  size_t inputW = in.getWidth();
-  size_t inputH = in.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-          // calculate four position for bilinear interpolation
-          const real* inPos = &inData[k * inputW + h * inImgW + w];
-          real* outPos = &outData[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            // bilinear interpolation
-            outPos[0] =
-                h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) +
-                h1lambda * (w2lambda * inPos[hid * inImgW] +
-                            w1lambda * inPos[hid * inImgW + wid]);
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&out));
-
-  size_t inputW = getWidth();
-  size_t inputH = getHeight();
-  size_t outputW = out.getWidth();
-  size_t batchSize = out.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-
-          real* inPos = &inGrad[k * inputW + h * inImgW + w];
-          const real* outPos = &outGrad[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            inPos[0] += h2lambda * w2lambda * outPos[0];
-            inPos[wid] += h2lambda * w1lambda * outPos[0];
-            inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0];
-            inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0];
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::vol2Col(real* data,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  real* outData = getData();
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIn = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
-                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
-          else
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::col2Vol(real* trg,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  real* src = getData();
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIm = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
-                alpha *
-                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
-                beta *
-                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
-        }
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////
-//               functions executed via cpu                   //
-////////////////////////////////////////////////////////////////
-
-void GpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  execViaCpu2(&CpuMatrix::selectElements, *this, table, ids);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/math/Matrix.h b/paddle/legacy/math/Matrix.h
deleted file mode 100644
index ff4f4cfc2a4..00000000000
--- a/paddle/legacy/math/Matrix.h
+++ /dev/null
@@ -1,2189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <memory>
-#include <thread>
-
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "Vector.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
-enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
-
-/**
- * @brief  matrix sparse_format .
- *
- * nnz represents nonzero number in sparse matrix.
- *
- * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
- * represents row start index in Matrix. length of col and value are nnz.
- *
- * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
- * represents col start index in Matrix. length of col and value are nnz.
- *
- * @code
- * for example: [0, 1, 0, 2, 0;
- *               1, 0, 0, 0, 0;
- *               0, 0, 0, 2, 5];
- * SPARSE_CSR row   [0, 2, 3, 5];
- *            col   [1, 3, 0, 3, 4];
- *            value [1, 2, 1, 2, 5]
- * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
- *            row   [1, 0, 0, 2, 2];
- *            value [1, 1, 2, 2, 5]
- * @endcode
- */
-/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
-enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-class Matrix;
-class GpuMatrix;
-class CpuMatrix;
-class CpuSparseMatrix;
-class GpuSparseMatrix;
-typedef std::shared_ptr<Matrix> MatrixPtr;
-typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
-typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
-typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
-typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-class Matrix : public BaseMatrix {
- protected:
-  Matrix(MemoryHandlePtr memHandle,
-         size_t height,
-         size_t width,
-         bool trans,
-         bool use_gpu);
-
-  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
-
-  Matrix(real* data,
-         size_t height,
-         size_t width,
-         size_t stride,
-         bool trans,
-         bool use_gpu);
-
-  static ThreadLocal<MatrixPtr> tmpMat_;
-
- public:
-  size_t elementCnt_;  // maximal number of elements which can be held in data_
-  MemoryHandlePtr memoryHandle_;
-
- public:
-  virtual ~Matrix() {}
-
-  static MatrixPtr create(MemoryHandlePtr memHandle,
-                          size_t height,
-                          size_t width,
-                          bool trans = false);
-  static MatrixPtr create(size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          size_t stride,
-                          bool trans = false,
-                          bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      bool trans = false,
-                                      bool useGpu = false);
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      SparseFormat foramt = SPARSE_CSR,
-                                      bool trans = false,
-                                      bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(real* data,
-                                      int* row,
-                                      int* col,
-                                      size_t height,
-                                      size_t width,
-                                      size_t nnz, /* used to allocate space */
-                                      SparseValueType valueType, /*value type*/
-                                      SparseFormat format,
-                                      bool trans,
-                                      bool useGpu);
-
-  static void resizeOrCreateSparseMatrix(
-      MatrixPtr& matrix,
-      size_t height,
-      size_t width,
-      size_t nnz,
-      SparseValueType valueType = FLOAT_VALUE,
-      SparseFormat foramt = SPARSE_CSR,
-      bool trans = false,
-      bool useGpu = false);
-
-  static void resizeOrCreate(MatrixPtr& a,
-                             size_t height,
-                             size_t width,
-                             bool trans = false,
-                             bool useGpu = false);
-
-  /**
-   * @brief  set the data buffer used to hold the matrix data.
-   *
-   * caller should make sure that the size of data is at least
-   * sizeof(real)*height*width.
-   */
-  void setData(real* data) {
-    BaseMatrix::setData(data);
-    memoryHandle_.reset();
-  }
-
-  /// the data should be contiguous
-  void setData(real* data, size_t newHeight, size_t newWidth) {
-    setData(data);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-  }
-
-  size_t getWidth() const { return width_; }
-  size_t getHeight() const { return height_; }
-  size_t getStride() const { return stride_; }
-  size_t getElementCnt() const { return elementCnt_; }
-  virtual real* getData() { return data_; }
-  virtual const real* getData() const { return data_; }
-  bool isTransposed() const { return trans_; }
-  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-
-  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
-  // befor call the following functions.
-  // Declare these functions in the base class just easy to call them.
-  // And these declarations should be moved to base class of sparse matrix
-  // if refactor sparse matrix
-  virtual int* getRows() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual int* getCols() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual SparseFormat getFormat() const {
-    LOG(FATAL) << "Not implemented";
-    return SPARSE_CSR;  //! suppress warning for no return value.
-  }
-
-  virtual SparseValueType getValueType() const {
-    LOG(FATAL) << "Not implemented";
-    return NO_VALUE;  //! suppress warning for no return value.
-  }
-
-  /**
-   * @brief matrix elment-wise add
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   */
-  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
-
-  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
-
-  void setDiag(real value);
-
-  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void trimFrom(const CpuSparseMatrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  // For GpuMatrix this is an asynchronous copy interface
-  // For CpuMatrix this is an synchronous copy interface
-  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  MatrixPtr subMatrix(size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol);
-
-  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
-    return subMatrix(startRow, endRow, 0, getWidth());
-  }
-
-  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
-    return subMatrix(0, getHeight(), startCol, endCol);
-  }
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
-    CHECK_LE(startRow + numRows, getHeight());
-    return Matrix::create(getData() + startRow * getWidth(),
-                          numRows,
-                          getWidth(),
-                          trans_,
-                          useGpu_);
-  }
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
-    CHECK_LE(startRow + numRows, getHeight());
-    CHECK_EQ(useGpu_, dest->useGpu_);
-    dest->setData(this->rowBuf(startRow), numRows, getWidth());
-    return dest;
-  }
-
-  /**
-   * If this is GpuMatrix, src is assumed to be CPU memory
-   *
-   * If this is CpuMatrix, src is assumed to be CPU memory
-   */
-  virtual void copyFrom(const real* src, size_t size) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void copyFrom(const real* src, const int64_t* seq) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief convert a int vector to a real matrix.
-   *
-   * (1) source and dest are both in CPU.
-   *
-   * (2) sizes are exactly match.
-   */
-  virtual void copyFrom(const IVector& src) {
-    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
-  }
-
-  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
-   *        NonValueSparseMatrix, etc.) as this.
-   *
-   * If height and width is zero, the new matrix will have the same size
-   * as this, otherwise the new matrix will have the specified size.
-   *
-   */
-  virtual MatrixPtr clone(size_t height = 0,
-                          size_t width = 0,
-                          bool useGpu = false) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real* getRowBuf(size_t row) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real getElement(size_t x, size_t y) const {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual real getSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void accumulateColSum(Matrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual real getAbsSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  /**
-   * @note Original data may not be preserved after resize().
-   */
-  virtual void resize(size_t newHeight, size_t newWidth) = 0;
-
-  /**
-   * @note This should only be used for sparse matrix.
-   */
-  virtual void resize(size_t newHeight,
-                      size_t newWidth,
-                      size_t newNnz, /* total item used to allocate space */
-                      SparseValueType valueType,
-                      SparseFormat format) = 0;
-
-  /**
-   * @brief This should only be used for sparse matrix.
-   *
-   * Currently must be called for each row in order.
-   * The matrix is not valid until setRow is called for the last row.
-   */
-  virtual void setRow(size_t row,
-                      size_t colNum,
-                      const unsigned int* cols,
-                      const real* values) = 0;
-
-  virtual MatrixPtr getTranspose() = 0;
-
-  /**
-   * @brief  hard transpose.
-   *
-   * allocate matTrans' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  rotate 90 degrees in clock-wise if clockWise=true;
-   *         otherwise rotate in anti clock-wise
-   * clock-wise:
-   * \f[
-   *   y(j,i) = x(M-i-1,j)
-   * \f]
-   * anti clock-wise:
-   * \f[
-   *   y(j,i) = x(i, N-1-j)
-   * \f]
-   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
-   *
-   * allocate matRot' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual MatrixPtr getInverse() {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  /**
-   * @brief  inverse.
-   *
-   * if allocate matInv's memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
- public:
-  /// Only set all variables to 0 or NULL but not free them.
-  virtual void clear() {
-    height_ = 0;
-    width_ = 0;
-    data_ = NULL;
-  }
-
-  void reshape(size_t height, size_t width);
-
-  /// add b to each sample of this.
-  virtual void addBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void addSharedBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void addBias(Matrix& b, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      addBias(b, scale);
-    } else {
-      addSharedBias(b, scale);
-    }
-  }
-
-  /// add each sample from a to this.
-  virtual void collectBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void collectSharedBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void collectBias(Matrix& a, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      collectBias(a, scale);
-    } else {
-      collectSharedBias(a, scale);
-    }
-  }
-
-  virtual void sequenceAvgForward(Matrix& a,
-                                  const IVector& startsPos,
-                                  int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void sequenceAvgBackward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  virtual void mul(const Matrix& a,
-                   const Matrix& b,
-                   real scaleAB,
-                   real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// Add a vector (column) b to matrix a, column by column.
-  virtual void addColumnVector(const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += vec(index(i, j), 0)
-   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
-   * @endcode
-   */
-  virtual void addByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   vec(index(i, j), 0) += this(i, j)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void addByBitCodeBackward(size_t numClasses,
-                                    const IVector& codes,
-                                    Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& mat,
-                            const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
-                                          const IVector& codes,
-                                          Matrix& mat,
-                                          const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   input.row(i) += this(i, j) * mat.row(index(i, j))
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardError(size_t numClasses,
-                                         const IVector& codes,
-                                         const Matrix& mat,
-                                         Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
-   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
-   * @endcode
-   */
-  virtual void sumByBitCode(size_t numClasses,
-                            IVector& codes,
-                            Matrix& sum,
-                            real scaleSum) {
-    (void)numClasses;
-    (void)codes;
-    (void)sum;
-    (void)scaleSum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *  this(i, j) -= bit(i, j)
-   * where bit(i, j) is same as that for sumByBitCode
-   * @endcode
-   */
-  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
-    (void)numClasses_;
-    (void)codes;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * add the sum of each row of this to mat
-   */
-  virtual void rowSum(Matrix& sum) {
-    (void)sum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each row of this to mat
-   */
-  virtual void rowMax(Matrix& max) {
-    (void)max;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each column of this to mat
-   */
-  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each column of this matrix.
-   *
-   * The row ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutForward(Matrix& a,
-                             IVector& id,
-                             size_t channels,
-                             size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutBackward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each row of this matrix.
-   *
-   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void rowMax(IVector& maxIds, Matrix& max) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// normalize each row so that the sum of each row is 1.
-  virtual void rowNormalizeL1(Matrix& out) {
-    (void)out;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   *  this = a*b
-   * @endcode
-   */
-  virtual void mul(const Matrix& a, const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = a*this)
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
-
-  /// merge the element for each col.
-  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                              IVector& label,
-                                              real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                IVector& label,
-                                                real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * \f[
-   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
-   * \f]
-   *
-   * b contains M elements,
-   * c contains N elements (N is odd),
-   * b's index arithmetic is computed modulo M,
-   * c's index arithmetic is computed modulo N.
-   */
-  virtual void circularConv(Matrix& b, Matrix& c) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void circularConvDerivative(Matrix& output,
-                                      Matrix& prevOut1,
-                                      Matrix& prevOut2,
-                                      Matrix& prevGrad1,
-                                      Matrix& prevGrad2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
-  virtual void softmax(Matrix& output) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void softmaxBackward(Matrix& outputV) {
-    (void)outputV;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /*
-    sum_i = sum_j this_ij * output_ij
-    this_ij = output_ij* (this_ij - sum_i)
-  */
-  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the sum of squares diff cost.
-  virtual void sumOfSquares(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// gradient of sumOfSquares.
-  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void tanhDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void softreluDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void scaledTanh(Matrix& output, real p1, real p2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print out the values of elements to os
-  virtual void print(std::ostream& os) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * print a part of the matrix
-   * from the (top,left) value to the (height, width) value (not included)
-   */
-  virtual void print(std::ostream& os, size_t height, size_t width) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print one row to os
-  virtual void printOneRow(std::ostream& os, size_t idx) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
-
-  virtual real getMin() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-  virtual real getMax() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief  calulate the error of classification
-   *
-   * output[i] = 1 if row i is an error.
-   *
-   * output[i] = 0 if row i is correct.
-   *
-   */
-  virtual void classificationError(Matrix& output,
-                                   IVector& label,
-                                   size_t topkSize = 1) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void upsampleForward(Matrix& input,
-                               Matrix& mask,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t outputH,
-                               size_t outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void upsampleBackward(Matrix& outputGrad,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value, if the maskMatP is not NULL, it will
-   * also caculate the location indices.
-   */
-  virtual void maxPoolForward(Matrix& inputMat,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              MatrixPtr maskMatP = NULL) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling backward operation.
-  virtual void maxPoolBackward(Matrix& image,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               Matrix& outGrad,
-                               Matrix& outV,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling forward operation, caculate the average of sizeX elements.
-  virtual void avgPoolForward(Matrix& input,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPoolBackward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling 3D forward operation, pick out the largest element
-   * in the sizeX of value
-   */
-  virtual void maxPool3DForward(Matrix& inputMat,
-                                Matrix& maxPoolIdx,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxPool3DBackward(Matrix& outGrad,
-                                 Matrix& maxPoolIdx,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DForward(Matrix& input,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DBackward(Matrix& input,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
- * Input: one or more sequences. Each sequence contains some instances.
- *
- * Output: output size is the number of input sequences (NOT input
- * instances).
- *
- * output[i] is set to max_input[i].
- */
-  virtual void maxSequenceForward(Matrix& input,
-                                  const IVector& sequence,
-                                  IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxSequenceBackward(Matrix& outputGrad,
-                                   const IVector& sequence,
-                                   IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-  /**
-   * @brief  cross entropy for multi binary labels
-   *
-   * @code
-   * this[i] = -sum(label[i][j]*log(output[i][j])
-   *           + (1-label[i][j])*log(1-output[i][j]))
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  The gradient of cross entropy for multi binary labels on output
-   *
-   * @code
-   * this[i][j] = -label[i][j]/output[i][j]
-   *              + (1-label[i][j])/(1-output[i][j])
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  Calculate the classification error for multi binary labels
-   *
-   * @code
-   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
-   *            || (output[i][j] < threshold && label[i][j] == 1))
-   *            / output->getWidth()
-   * @endcode
-   */
-  virtual void classificationErrorMulti(Matrix& output,
-                                        Matrix& label,
-                                        real threshold) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void paramReluForward(Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void vol2Col(real* data,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void col2Vol(real* trg,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real alpha,
-                       real beta) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void bilinearForward(const Matrix& in,
-                               const size_t inImgH,
-                               const size_t inImgW,
-                               const size_t outImgH,
-                               const size_t outImgW,
-                               const size_t numChannels,
-                               const real ratioH,
-                               const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void bilinearBackward(const Matrix& out,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<real>(*this, expr);
-    } else {
-      TensorCpuApply<real>(*this, expr);
-    }
-  }
-
-  bool isEmpty() const { return data_ == nullptr; }
-
-  explicit operator bool() const { return !isEmpty(); }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
-  mat.print(os);
-  return os;
-}
-
-class GpuMatrix : public Matrix {
- public:
-  GpuMatrix();
-
-  GpuMatrix(size_t height, size_t width, bool trans = false);
-  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, true) {}
-  GpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, true) {}
-  GpuMatrix(GpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, true) {}
-  ~GpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  /**
-   * Copy the data from cpu_memory buffer
-   */
-  void copyFrom(const real* hostSrc, size_t size);
-
-  void copyFrom(const real* hostSrc, const int64_t* seq);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const IVector& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  real getElement(size_t x, size_t y) const;
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  real getMin();
-  real getMax();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /**
-   * @code
-   * add each sample from a to this.
-   * @endcode
-   */
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*b
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-
-  void mul(const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  void mul(const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  void rightMul(Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*this
-   * @endcode
-   */
-  void leftMul(Matrix& a);
-
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& max);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& max);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxBackward(Matrix& outputV);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  virtual void print(std::ostream& os) const;
-  virtual void print(std::ostream& os, size_t height, size_t width) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void upsampleForward(Matrix& input,
-                       Matrix& mask,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t channels,
-                       size_t outputH,
-                       size_t outputW);
-
-  void upsampleBackward(Matrix& outputGrad,
-                        Matrix& mask,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t channels,
-                        size_t outputH,
-                        size_t outputW);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<real>(*this, expr);
-  }
-};
-
-class CpuMatrix : public Matrix {
- private:
-  MatrixPtr sftmaxSum_;
-  MatrixPtr sftmaxDot_;
-
- public:
-  CpuMatrix(size_t height, size_t width, bool trans = false);
-  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, false) {}
-  CpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, false) {}
-
-  CpuMatrix(CpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, false) {}
-
-  ~CpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  real getElement(size_t x, size_t y) const;
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const real* cpuSrc, size_t size);
-
-  void copyFrom(const real* cpuSrc, const int64_t* seq);
-
-  void copyFrom(const IVector& src);
-
-  void copyFrom(CpuSparseMatrix& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  void upsampleForward(Matrix& input,
-                       Matrix& mask,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t channels,
-                       size_t outputH,
-                       size_t outputW);
-
-  void upsampleBackward(Matrix& outputGrad,
-                        Matrix& mask,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t channels,
-                        size_t outputH,
-                        size_t outputW);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
- public:
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /// add each sample of a to this.
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids);
-
-  /**
-   * use abstract getRow() to get row from table.
-   *
-   * Define table as template instead of virtual class for performance sake.
-   * internal used by above two virtual funcs.
-   */
-  template <typename TableMatType>
-  void selectRowsImp(TableMatType& table, IVector& ids);
-  template <typename TableMatType>
-  void addToRowsImp(TableMatType& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
-
-  static void mul(CpuMatrix* a,
-                  CpuMatrix* b,
-                  CpuSparseMatrix* c,
-                  real scaleAB,
-                  real scaleT);
-
-  /**
-   * c = a * b
-   *
-   * use abstract getRow() to get row from B,C.
-   * Define B,C as template instead of virtual class for performance sake.
-   */
-  template <typename MatBType, typename MatCType>
-  static void mul(
-      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(const Matrix& a, const Matrix& b);
-
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-  void rightMul(Matrix& b);
-
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-  void leftMul(Matrix& a);
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMaxId(IVector& maxIds);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& maxVal);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void rowNormalizeL1(Matrix& out);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void circularConv(Matrix& b, Matrix& c);
-  void circularConvDerivative(Matrix& output,
-                              Matrix& prevOut1,
-                              Matrix& prevOut2,
-                              Matrix& prevGrad1,
-                              Matrix& prevGrad2);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-
-  void smoothL1(Matrix& output, Matrix& label, real destScale);
-  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);
-
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  void print(std::ostream& os) const;
-  void print(std::ostream& os, size_t height, size_t width) const;
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-
-  real getMin();
-  real getMax();
-
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
-
-  void addByBitCodeBackward(size_t numClasses,
-                            const IVector& codes,
-                            Matrix& vec);
-
-  void mulByBitCode(size_t numClasses,
-                    const IVector& codes,
-                    const Matrix& mat,
-                    const Matrix& input);
-
-  void mulByBitCodeBackwardWeight(size_t numClasses,
-                                  const IVector& codes,
-                                  Matrix& mat,
-                                  const Matrix& input);
-
-  void mulByBitCodeBackwardError(size_t numClasses,
-                                 const IVector& codes,
-                                 const Matrix& mat,
-                                 Matrix& input);
-
-  void sumByBitCode(size_t numClasses,
-                    IVector& codes,
-                    Matrix& sum,
-                    real scaleSum);
-
-  void subByBitCode(size_t numClasses_, IVector& codes);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<real>(*this, expr);
-  }
-};
-
-class SharedCpuMatrix : public CpuMatrix {
- public:
-#ifndef PADDLE_MOBILE_INFERENCE
-  /* blockNum is number of partitions of the matrix  */
-  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(height, width, trans) {
-    initShared(blockNum);
-  }
-  SharedCpuMatrix(
-      int blockNum, real* data, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(data, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(int blockNum,
-                  CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initBlock(1);
-  }
-
-  ~SharedCpuMatrix() {}
-
- public:
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  virtual void add(Matrix& b, real p1, real p2);
-  virtual void add(real p1, real p2);
-
- private:
-  using Matrix::mul;
-  void initShared(int blockNum);
-  void initBlock(int blockNum);
-
-  int blockNum_;
-  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
-  ThreadLocal<CpuMatrixPtr> localBuf_;
-  ThreadLocal<std::vector<int>> localBufRows_;
-  ThreadLocal<std::vector<int>> blockSeq_;
-#endif
-};
-
-typedef struct { unsigned int col; } sparse_non_value_t;
-
-typedef struct {
-  unsigned int col;
-  float value;
-} sparse_float_value_t;
-
-}  // namespace paddle
-#include "ExecViaCpu.h"
diff --git a/paddle/legacy/math/MatrixBitCode.cpp b/paddle/legacy/math/MatrixBitCode.cpp
deleted file mode 100644
index f35f266a305..00000000000
--- a/paddle/legacy/math/MatrixBitCode.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-namespace {
-
-struct SimpleCode {
-  SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {}
-  inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; }
-  inline bool calcBit(int bit) const { return c_ & (1 << bit); }
-  inline int getLength() const { return findLastSet(c_) - 1; }
-
- private:
-  size_t c_;
-};
-
-struct SimpleCodeTable {
-  explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {}
-  SimpleCode operator()(size_t code) const {
-    return SimpleCode(code, numClasses_);
-  }
-  size_t size() const { return numClasses_; }
-  int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
-
- private:
-  size_t numClasses_;
-  int maxCodeLength_;
-};
-
-}  // namespace
-
-/**
- * CodeTable class should support 3 functions:
- *
- * size_t size()
- *   return the number of codes
- *
- * int getMaxCodeLength()
- *   return the maximal code length
- *
- * Code operator()(size_t i)
- *   return the i-th code. Code class is descriebed below.
- *
- * Code class should support 3 functions:
- *
- * int getLength()
- *   return the length of the code
- *
- * bool calcIndex(int bit)
- *   bit ranges from 0 to getLength() - 1
- *   return the index for the (1+bit) level parent
- *
- * bool calcBit(int bit)
- *   return true if the bit level parent is the right child of (1+bit) level
- *   parent
- *
- */
-
-/*
-   for i:
-     for j < codeLength:
-       op(tmat(i, j), vec(0, index(i, j)))
-*/
-template <class CodeTable, class Op, class TMat, class Mat>
-static void addByBitCodeT(
-    Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) {
-  CHECK(!vec.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(vec.getHeight(), (size_t)1);
-  CHECK_EQ(vec.getWidth(), numClasses - 1);
-
-  auto data = tmat.getData();
-  auto v = vec.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], v[index]);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += vec(0, index(i, j))
-*/
-void CpuMatrix::addByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& vec) {
-  auto op = [](real& t, real v) { t += v; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/* For j < codeLength:
-   vec(0, index(i, j)) += this(i, j)
-*/
-void CpuMatrix::addByBitCodeBackward(size_t numClasses,
-                                     const IVector& codes,
-                                     Matrix& vec) {
-  auto op = [](real t, real& v) { v += t; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/*
-  for i:
-    for j < codeLength:
-      op(tmat(i, j), mat.row(index(i, j)), input.row(i))
-*/
-template <class Op,
-          class CodeTable,
-          class IVec,
-          class TMat,
-          class WMat,
-          class InMat>
-void mulByBitCodeT(Op op,
-                   CodeTable codeTable,
-                   IVec& codes,
-                   TMat& tmat,
-                   WMat& weight,
-                   InMat& input) {
-  CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t inputDim = input.getWidth();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(input.getHeight(), numSamples);
-  CHECK_EQ(weight.getHeight(), numClasses - 1);
-  CHECK_EQ(weight.getWidth(), inputDim);
-
-  real* data = tmat.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += <weight.row(index(i, j)), input.row(i)>
-*/
-void CpuMatrix::mulByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& weight,
-                             const Matrix& input) {
-  auto op = [](
-      real& t, const real* weightRow, const real* inputRow, size_t inputDim) {
-    real sum = 0;
-    for (size_t k = 0; k < inputDim; ++k) {
-      sum += weightRow[k] * inputRow[k];
-    }
-    t += sum;
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For index(i, j) >= 0:
-   weight.row(index(i, j)) += this(i, j) * input.row(i)
-*/
-void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses,
-                                           const IVector& codes,
-                                           Matrix& weight,
-                                           const Matrix& input) {
-  auto op = [](
-      const real t, real* weightRow, const real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      weightRow[k] += t * inputRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For j < codeLength:
-   input.row(i) += this(i, j) * weight.row(index(i, j))
-*/
-void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses,
-                                          const IVector& codes,
-                                          const Matrix& weight,
-                                          Matrix& input) {
-  auto op = [](
-      const real t, const real* weightRow, real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      inputRow[k] += t * weightRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-template <class CodeTable>
-void sumByBitCodeT(CodeTable codeTable,
-                   IVector& codes,
-                   const CpuMatrix& tmat,
-                   Matrix& sum,
-                   real scaleSum) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(sum.getHeight(), numSamples);
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  const real* data = tmat.getData();
-  real* s = sum.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real sm = 0;
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        sm += data[i * oWidth + j];
-      }
-    }
-    s[i] = scaleSum * sm;
-  }
-}
-
-/* For j < codeLength:
-   sum(i, 0) = \sum_j  bit(i, j) * this(i, j)
-*/
-void CpuMatrix::sumByBitCode(size_t numClasses,
-                             IVector& codes,
-                             Matrix& sum,
-                             real scaleSum) {
-  sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum);
-}
-
-template <class CodeTable>
-void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-
-  real* data = tmat.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        data[i * oWidth + j] -= 1;
-      }
-    }
-  }
-}
-
-/* For j < codeLength
-   this(i, j) -= bit(i, j)
-*/
-void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) {
-  subByBitCodeT(SimpleCodeTable(numClasses), codes, *this);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MemoryHandle.cpp b/paddle/legacy/math/MemoryHandle.cpp
deleted file mode 100644
index 1563314e921..00000000000
--- a/paddle/legacy/math/MemoryHandle.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MemoryHandle.h"
-#include <cmath>
-#include "Storage.h"
-
-namespace paddle {
-
-/**
- * Calculate the actual allocation size according to the required size.
- */
-MemoryHandle::MemoryHandle(size_t size) : size_(size), buf_(nullptr) {
-  if (size_ <= 256) {
-    // Memory allocation in cuda is always aligned to at least 256 bytes.
-    // In many cases it is 512 bytes.
-    allocSize_ = 256;
-  } else if (size_ <= 512) {
-    allocSize_ = 512;
-  } else if (size_ <= (1 << 16)) {
-    // Allocate multiple of 1024 bytes.
-    allocSize_ = (size + 1023) & ~(1023);
-  } else {
-    allocSize_ = size_;
-  }
-}
-
-GpuMemoryHandle::GpuMemoryHandle(size_t size) : MemoryHandle(size) {
-  CHECK(size != 0) << " allocate 0 bytes";
-  deviceId_ = hl_get_device();
-  allocator_ = StorageEngine::singleton()->getGpuAllocator(deviceId_);
-  buf_ = allocator_->alloc(allocSize_);
-}
-
-GpuMemoryHandle::~GpuMemoryHandle() { allocator_->free(buf_, allocSize_); }
-
-CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) {
-  CHECK(size != 0) << " allocate 0 bytes";
-  allocator_ = StorageEngine::singleton()->getCpuAllocator();
-  buf_ = allocator_->alloc(allocSize_);
-}
-
-CpuMemoryHandle::~CpuMemoryHandle() { allocator_->free(buf_, allocSize_); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MemoryHandle.h b/paddle/legacy/math/MemoryHandle.h
deleted file mode 100644
index 516e09dbed4..00000000000
--- a/paddle/legacy/math/MemoryHandle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "PoolAllocator.h"
-
-namespace paddle {
-
-class MemoryHandle {
- protected:
-  explicit MemoryHandle(size_t size);
-  virtual ~MemoryHandle() {}
-
- public:
-  void* getBuf() const { return buf_; }
-  size_t getSize() const { return size_; }
-  size_t getAllocSize() const { return allocSize_; }
-
- protected:
-  PoolAllocator* allocator_;
-  size_t size_;       // the requested size
-  size_t allocSize_;  // the allocated size
-  int deviceId_;      // the device id of memory if gpu memory
-  void* buf_;
-};
-
-/**
- * Wrapper class for raw gpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class GpuMemoryHandle : public MemoryHandle {
- public:
-  explicit GpuMemoryHandle(size_t size);
-  virtual ~GpuMemoryHandle();
-};
-
-/**
- * Wrapper class for raw cpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class CpuMemoryHandle : public MemoryHandle {
- public:
-  explicit CpuMemoryHandle(size_t size);
-  virtual ~CpuMemoryHandle();
-};
-
-typedef std::shared_ptr<MemoryHandle> MemoryHandlePtr;
-typedef std::shared_ptr<CpuMemoryHandle> CpuMemHandlePtr;
-typedef std::shared_ptr<GpuMemoryHandle> GpuMemHandlePtr;
-}  // namespace paddle
diff --git a/paddle/legacy/math/NEONFunctions.cpp b/paddle/legacy/math/NEONFunctions.cpp
deleted file mode 100644
index 953d5bb8c81..00000000000
--- a/paddle/legacy/math/NEONFunctions.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#include "NEONFunctions.h"
-#include <arm_neon.h>
-
-namespace paddle {
-namespace neon {
-
-// b[i] = a[i] > 0.0f ? a[i] : 0.0f
-void relu(const float* a, float* b, int len) {
-  int offset = len % 16;
-  float32x4_t ma0, ma1, ma2, ma3;
-  float32x4_t mb0, mb1, mb2, mb3;
-
-  float32x4_t zero = vdupq_n_f32(0.f);
-  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = vld1q_f32(a);
-    ma1 = vld1q_f32(a + 4);
-    ma2 = vld1q_f32(a + 8);
-    ma3 = vld1q_f32(a + 12);
-
-    mb0 = vmaxq_f32(ma0, zero);
-    mb1 = vmaxq_f32(ma1, zero);
-    mb2 = vmaxq_f32(ma2, zero);
-    mb3 = vmaxq_f32(ma3, zero);
-
-    vst1q_f32(b, mb0);
-    vst1q_f32(b + 4, mb1);
-    vst1q_f32(b + 8, mb2);
-    vst1q_f32(b + 12, mb3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    b[i] = a[i] > 0.0f ? a[i] : 0.0f;
-  }
-}
-
-// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
-void prelu(const float* a, float w, float* b, int len) {
-  int offset = len % 16;
-  float32x4_t ma0, ma1, ma2, ma3;
-
-  float32x4_t zero = vdupq_n_f32(0.f);
-  float32x4_t vw = vdupq_n_f32(w);
-
-  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = vld1q_f32(a);
-    ma1 = vld1q_f32(a + 4);
-    ma2 = vld1q_f32(a + 8);
-    ma3 = vld1q_f32(a + 12);
-
-    uint32x4_t flag0 = vcgtq_f32(ma0, zero);
-    uint32x4_t flag1 = vcgtq_f32(ma1, zero);
-    uint32x4_t flag2 = vcgtq_f32(ma2, zero);
-    uint32x4_t flag3 = vcgtq_f32(ma3, zero);
-
-    float32x4_t mul0 = vmulq_f32(ma0, vw);
-    float32x4_t mul1 = vmulq_f32(ma1, vw);
-    float32x4_t mul2 = vmulq_f32(ma2, vw);
-    float32x4_t mul3 = vmulq_f32(ma3, vw);
-
-    ma0 = vbslq_f32(flag0, ma0, mul0);
-    ma1 = vbslq_f32(flag1, ma1, mul1);
-    ma2 = vbslq_f32(flag2, ma2, mul2);
-    ma3 = vbslq_f32(flag3, ma3, mul3);
-
-    vst1q_f32(b, ma0);
-    vst1q_f32(b + 4, ma1);
-    vst1q_f32(b + 8, ma2);
-    vst1q_f32(b + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
-  }
-}
-
-}  // namespace neon
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/NEONFunctions.h b/paddle/legacy/math/NEONFunctions.h
deleted file mode 100644
index 33edd9d518d..00000000000
--- a/paddle/legacy/math/NEONFunctions.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace neon {
-
-void relu(const float* a, float* b, int len);
-void prelu(const float* a, float w, float* b, int len);
-
-}  // namespace neon
-}  // namespace paddle
diff --git a/paddle/legacy/math/PoolAllocator.cpp b/paddle/legacy/math/PoolAllocator.cpp
deleted file mode 100644
index b6ad168856a..00000000000
--- a/paddle/legacy/math/PoolAllocator.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolAllocator.h"
-
-namespace paddle {
-
-PoolAllocator::PoolAllocator(Allocator* allocator,
-                             size_t sizeLimit,
-                             const std::string& name)
-    : allocator_(allocator),
-      sizeLimit_(sizeLimit),
-      poolMemorySize_(0),
-      name_(name) {}
-
-PoolAllocator::~PoolAllocator() { freeAll(); }
-
-void* PoolAllocator::alloc(size_t size) {
-  if (sizeLimit_ > 0) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto it = pool_.find(size);
-    if (it == pool_.end() || it->second.size() == 0) {
-      if (poolMemorySize_ >= sizeLimit_) {
-        freeAll();
-      }
-      return allocator_->alloc(size);
-    } else {
-      auto buf = it->second.back();
-      it->second.pop_back();
-      poolMemorySize_ -= size;
-      return buf;
-    }
-  } else {
-    return allocator_->alloc(size);
-  }
-}
-
-void PoolAllocator::free(void* ptr, size_t size) {
-  if (sizeLimit_ > 0) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto& it = pool_[size];
-    it.push_back(ptr);
-    poolMemorySize_ += size;
-  } else {
-    allocator_->free(ptr);
-  }
-}
-
-void PoolAllocator::freeAll() {
-  for (auto it : pool_) {
-    for (auto ptr : it.second) {
-      allocator_->free(ptr);
-    }
-  }
-  poolMemorySize_ = 0;
-  pool_.clear();
-}
-
-void PoolAllocator::printAll() {
-  size_t memory = 0;
-  LOG(INFO) << name_ << ":";
-  for (auto it : pool_) {
-    LOG(INFO) << "  size:" << it.first;
-    for (auto ptr : it.second) {
-      LOG(INFO) << "    ptr:" << ptr;
-      memory += it.first;
-    }
-  }
-  LOG(INFO) << "memory size: " << memory;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/PoolAllocator.h b/paddle/legacy/math/PoolAllocator.h
deleted file mode 100644
index 7239cf1c449..00000000000
--- a/paddle/legacy/math/PoolAllocator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-#include "Allocator.h"
-
-namespace paddle {
-
-/**
- * @brief Memory pool allocator implementation.
- */
-class PoolAllocator {
- public:
-  /**
-   * @brief constructor.
-   * @param allocator a Allocator object.
-   * @param sizeLimit The maximum size memory can be managed,
-   * if sizeLimit == 0, the pool allocator is a simple wrapper of allocator.
-   */
-  PoolAllocator(Allocator* allocator,
-                size_t sizeLimit = 0,
-                const std::string& name = "pool");
-
-  /**
-   * @brief destructor.
-   */
-  ~PoolAllocator();
-
-  void* alloc(size_t size);
-  void free(void* ptr, size_t size);
-  std::string getName() { return name_; }
-
- private:
-  void freeAll();
-  void printAll();
-  std::unique_ptr<Allocator> allocator_;
-  std::mutex mutex_;
-  std::unordered_map<size_t, std::vector<void*>> pool_;
-  size_t sizeLimit_;
-  size_t poolMemorySize_;
-  std::string name_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/RowBuffer.h b/paddle/legacy/math/RowBuffer.h
deleted file mode 100644
index 9dfd5eff06a..00000000000
--- a/paddle/legacy/math/RowBuffer.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "MemoryHandle.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * @brief The RowBuffer class
- * Represent the SparseRow Matrix Data.
- *
- * If not set memory handler, then the data could be auto growth.
- */
-class RowBuffer {
- public:
-  /**
-   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  explicit RowBuffer(size_t width) : width_(width) {}
-
-  /**
-   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
-   * @param mem the pre-allocated memory.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
-      : preallocatedBuf_(mem), width_(width) {}
-
-  /**
-   * @brief resize resize the buffer with rowCount
-   * @param rowCnt number of row. matrix height.
-   */
-  inline void resize(int rowCnt) {
-    if (preallocatedBuf_) {
-      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
-    } else {
-      rowStore_.resize(rowCnt * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* get(int row) const {
-    if (preallocatedBuf_) {
-      CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize());
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
-    } else {
-      CHECK_LE((row + 1) * width_, rowStore_.size());
-      return const_cast<real*>(rowStore_.data() + row * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index. If row index is larger than local
-   *        buffer, the size of local buffer will grow.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* getWithAutoGrowth(int row) {
-    if (preallocatedBuf_) {
-      return get(row);
-    } else {
-      if ((rowStore_.size() <= row * width_)) {
-        rowStore_.resize((row + 1) * width_);
-      }
-      return rowStore_.data() + row * width_;
-    }
-  }
-
-  /**
-   * @return raw data buffer.
-   */
-  inline real* data() {
-    if (preallocatedBuf_) {
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
-    } else {
-      return rowStore_.data();
-    }
-  }
-
-  /**
-   * @brief clear local buffer. It only affect auto-growth buffer.
-   */
-  inline void clear() {
-    // swap an empty vector to it to free the memory.
-    std::vector<real, AlignedAllocator<real, 32>> empty;
-    rowStore_.swap(empty);
-  }
-
-  /**
-   * @brief get current number of rows.
-   * @return number of rows.
-   */
-  inline size_t getRowCount() const {
-    if (preallocatedBuf_) {
-      return preallocatedBuf_->getSize() / sizeof(real) / width_;
-    } else {
-      return rowStore_.size() / width_;
-    }
-  }
-
-  /**
-   * @brief get is this buffer can automatically grow or not.
-   * @return ture if can automacitally grow.
-   */
-  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
-
-  /**
-   * @brief return the width of matrix. a.k.a length of row.
-   * @return width of matrix
-   */
-  inline size_t getWidth() const { return width_; }
-
- private:
-  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
-  //! of std::vector here.
-  CpuMemHandlePtr preallocatedBuf_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
-  size_t width_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/math/SIMDFunctions.cpp b/paddle/legacy/math/SIMDFunctions.cpp
deleted file mode 100644
index 3cfc5d6f1e0..00000000000
--- a/paddle/legacy/math/SIMDFunctions.cpp
+++ /dev/null
@@ -1,397 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SIMDFunctions.h"
-#ifdef __SSE3__
-#include <immintrin.h>
-#endif
-#include <algorithm>
-
-#ifdef __AVX__
-static void addto_avx(float* a, const float* b, size_t len) {
-  int offset = len % 32;
-
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 32; k++, a += 32, b += 32) {
-    ma0 = _mm256_load_ps(a);
-    ma1 = _mm256_load_ps(a + 8);
-    ma2 = _mm256_load_ps(a + 16);
-    ma3 = _mm256_load_ps(a + 24);
-
-    mb0 = _mm256_load_ps(b);
-    mb1 = _mm256_load_ps(b + 8);
-    mb2 = _mm256_load_ps(b + 16);
-    mb3 = _mm256_load_ps(b + 24);
-
-    ma0 = _mm256_add_ps(ma0, mb0);
-    ma1 = _mm256_add_ps(ma1, mb1);
-    ma2 = _mm256_add_ps(ma2, mb2);
-    ma3 = _mm256_add_ps(ma3, mb3);
-
-    _mm256_store_ps(a, ma0);
-    _mm256_store_ps(a + 8, ma1);
-    _mm256_store_ps(a + 16, ma2);
-    _mm256_store_ps(a + 24, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-
-  return;
-}
-
-static void batch_addto_avx(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 32;
-
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 32; k++, a += 32) {
-    ma0 = _mm256_load_ps(a);
-    ma1 = _mm256_load_ps(a + 8);
-    ma2 = _mm256_load_ps(a + 16);
-    ma3 = _mm256_load_ps(a + 24);
-
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm256_load_ps(b[i]);
-      mb1 = _mm256_load_ps(b[i] + 8);
-      mb2 = _mm256_load_ps(b[i] + 16);
-      mb3 = _mm256_load_ps(b[i] + 24);
-      ma0 = _mm256_add_ps(ma0, mb0);
-      ma1 = _mm256_add_ps(ma1, mb1);
-      ma2 = _mm256_add_ps(ma2, mb2);
-      ma3 = _mm256_add_ps(ma3, mb3);
-      b[i] += 32;
-    }
-
-    _mm256_store_ps(a, ma0);
-    _mm256_store_ps(a + 8, ma1);
-    _mm256_store_ps(a + 16, ma2);
-    _mm256_store_ps(a + 24, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_avx(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 32;
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 32; k++, result += 32, data += 32) {
-    ma0 = _mm256_load_ps(result);
-    ma1 = _mm256_load_ps(result + 8);
-    ma2 = _mm256_load_ps(result + 16);
-    ma3 = _mm256_load_ps(result + 24);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm256_load_ps(data + i * dim);
-      mb1 = _mm256_load_ps(data + i * dim + 8);
-      mb2 = _mm256_load_ps(data + i * dim + 16);
-      mb3 = _mm256_load_ps(data + i * dim + 24);
-      ma0 = _mm256_max_ps(ma0, mb0);
-      ma1 = _mm256_max_ps(ma1, mb1);
-      ma2 = _mm256_max_ps(ma2, mb2);
-      ma3 = _mm256_max_ps(ma3, mb3);
-    }
-    _mm256_store_ps(result, ma0);
-    _mm256_store_ps(result + 8, ma1);
-    _mm256_store_ps(result + 16, ma2);
-    _mm256_store_ps(result + 24, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-
-static void decayL1_avx(float* dst, float* src, float lambda, size_t sz) {
-  int64_t i;
-  int64_t size = sz;
-  float src_val;
-
-  __m256 ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
-  //  __m256 ymm9, ymm10;
-
-  ymm1 = _mm256_set1_ps(lambda);
-  ymm2 = _mm256_setzero_ps();
-
-  for (i = 0; i <= size - 16; i += 16) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm6 = _mm256_load_ps(src + i + 8);
-
-    ymm4 = _mm256_sub_ps(ymm3, ymm1);
-    ymm7 = _mm256_sub_ps(ymm6, ymm1);
-
-    ymm5 = _mm256_add_ps(ymm3, ymm1);
-    ymm8 = _mm256_add_ps(ymm6, ymm1);
-
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm7 = _mm256_max_ps(ymm7, ymm2);
-
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm8 = _mm256_min_ps(ymm8, ymm2);
-
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    ymm8 = _mm256_or_ps(ymm7, ymm8);
-
-    _mm256_store_ps(dst + i, ymm5);
-    _mm256_store_ps(dst + i + 8, ymm8);
-  }
-  if (i <= size - 8) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm4 = _mm256_sub_ps(ymm3, ymm1);
-    ymm5 = _mm256_add_ps(ymm3, ymm1);
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    _mm256_store_ps(dst + i, ymm5);
-
-    i += 8;
-  }
-  for (; i < size; i++) {
-    src_val = src[i];
-    if (src_val > 0) {
-      dst[i] = ((src_val > lambda) ? (src_val - lambda) : 0);
-    } else {
-      dst[i] = ((-src_val > lambda) ? (src_val + lambda) : 0);
-    }
-  }
-}
-
-static void decayL1_avx(
-    float* dst, float* src, float* lr, float lambda, size_t sz) {
-  int64_t i;
-  int64_t size = sz;
-  float src_val;
-
-  __m256 ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
-  __m256 ymm9, ymm10;
-
-  ymm1 = _mm256_set1_ps(lambda);
-  ymm2 = _mm256_setzero_ps();
-
-  for (i = 0; i <= size - 16; i += 16) {
-    ymm9 = _mm256_load_ps(lr + i);
-    ymm10 = _mm256_load_ps(lr + i + 8);
-
-    ymm3 = _mm256_load_ps(src + i);
-    ymm6 = _mm256_load_ps(src + i + 8);
-
-    ymm9 = _mm256_mul_ps(ymm9, ymm1);
-    ymm10 = _mm256_mul_ps(ymm10, ymm1);
-
-    ymm4 = _mm256_sub_ps(ymm3, ymm9);
-    ymm7 = _mm256_sub_ps(ymm6, ymm10);
-
-    ymm5 = _mm256_add_ps(ymm3, ymm9);
-    ymm8 = _mm256_add_ps(ymm6, ymm10);
-
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm7 = _mm256_max_ps(ymm7, ymm2);
-
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm8 = _mm256_min_ps(ymm8, ymm2);
-
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    ymm8 = _mm256_or_ps(ymm7, ymm8);
-
-    _mm256_store_ps(dst + i, ymm5);
-    _mm256_store_ps(dst + i + 8, ymm8);
-  }
-  if (i <= size - 8) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm9 = _mm256_load_ps(lr + i);
-    ymm9 = _mm256_mul_ps(ymm9, ymm1);
-    ymm4 = _mm256_sub_ps(ymm3, ymm9);
-    ymm5 = _mm256_add_ps(ymm3, ymm9);
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    _mm256_store_ps(dst + i, ymm5);
-
-    i += 8;
-  }
-  for (; i < size; i++) {
-    src_val = src[i];
-    float nlambda = lr[i] * lambda;
-    if (src_val > 0) {
-      dst[i] = ((src_val > nlambda) ? (src_val - nlambda) : 0);
-    } else {
-      dst[i] = ((-src_val > nlambda) ? (src_val + nlambda) : 0);
-    }
-  }
-}
-
-#elif defined(__SSE3__)
-
-static void addto_sse(float* a, const float* b, size_t len) {
-  int offset = len % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    mb0 = _mm_load_ps(b);
-    mb1 = _mm_load_ps(b + 4);
-    mb2 = _mm_load_ps(b + 8);
-    mb3 = _mm_load_ps(b + 12);
-
-    ma0 = _mm_add_ps(ma0, mb0);
-    ma1 = _mm_add_ps(ma1, mb1);
-    ma2 = _mm_add_ps(ma2, mb2);
-    ma3 = _mm_add_ps(ma3, mb3);
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-}
-
-static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 16;
-
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm_load_ps(b[i]);
-      mb1 = _mm_load_ps(b[i] + 4);
-      mb2 = _mm_load_ps(b[i] + 8);
-      mb3 = _mm_load_ps(b[i] + 12);
-      ma0 = _mm_add_ps(ma0, mb0);
-      ma1 = _mm_add_ps(ma1, mb1);
-      ma2 = _mm_add_ps(ma2, mb2);
-      ma3 = _mm_add_ps(ma3, mb3);
-      b[i] += 16;
-    }
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_sse(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
-    ma0 = _mm_load_ps(result);
-    ma1 = _mm_load_ps(result + 4);
-    ma2 = _mm_load_ps(result + 8);
-    ma3 = _mm_load_ps(result + 12);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm_load_ps(data + i * dim);
-      mb1 = _mm_load_ps(data + i * dim + 4);
-      mb2 = _mm_load_ps(data + i * dim + 8);
-      mb3 = _mm_load_ps(data + i * dim + 12);
-      ma0 = _mm_max_ps(ma0, mb0);
-      ma1 = _mm_max_ps(ma1, mb1);
-      ma2 = _mm_max_ps(ma2, mb2);
-      ma3 = _mm_max_ps(ma3, mb3);
-    }
-    _mm_store_ps(result, ma0);
-    _mm_store_ps(result + 4, ma1);
-    _mm_store_ps(result + 8, ma2);
-    _mm_store_ps(result + 12, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-
-#endif
-
-#if defined(__AVX__)
-#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
-#elif defined(__SSE3__)
-#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
-#endif
-
-namespace paddle {
-namespace simd {
-namespace internal {
-#ifdef __SSE3__
-void addToImpl(float* a, const float* b, size_t len) {
-  SIMD_INVOKE(addto, a, b, len);
-}
-void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
-  SIMD_INVOKE(batch_addto, a, b, batch, len);
-}
-
-void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
-  SIMD_INVOKE(col_max, result, data, dim, numSamples);
-}
-#endif
-
-#ifdef __AVX__
-void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
-  decayL1_avx(dst, src, lambda, len);
-}
-void decayL1AvxImpl(
-    float* dst, float* src, float* lr, float lambda, size_t len) {
-  decayL1_avx(dst, src, lr, lambda, len);
-}
-#endif
-
-}  // namespace internal
-}  // namespace simd
-}  // namespace paddle
diff --git a/paddle/legacy/math/SIMDFunctions.h b/paddle/legacy/math/SIMDFunctions.h
deleted file mode 100644
index 5b1dfea9d3c..00000000000
--- a/paddle/legacy/math/SIMDFunctions.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-#include <stdint.h>
-
-namespace paddle {
-
-namespace simd {
-
-namespace naive {
-template <typename Type>
-inline void addTo(Type* a, const Type* b, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    a[i] += b[i];
-  }
-}
-
-template <typename Type>
-inline void batchAddTo(Type* a, const Type* b[], int batch, size_t len) {
-  for (int i = 0; i < batch; ++i) {
-    for (size_t j = 0; j < len; ++j) {
-      a[j] += b[i][j];
-    }
-  }
-}
-
-/**
- * @note this method is unused in paddle.
- */
-template <typename Type>
-inline void colMax(Type* result, const Type* data, int dim, int numSamples) {
-  for (int d = 0; d < dim; ++d) {
-    Type sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = sm > data[i * dim + d] ? sm : data[i * dim + d];
-    }
-    result[d] = sm;
-  }
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type* lr, Type lambda, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    Type& src_val = src[i];
-    float nlambda = lr[i] * lambda;
-    if (src_val > 0) {
-      dst[i] = ((src_val > nlambda) ? (src_val - nlambda) : 0);
-    } else {
-      dst[i] = ((-src_val > nlambda) ? (src_val + nlambda) : 0);
-    }
-  }
-}
-
-template <class Type>
-inline void decayL1(Type* dst, Type* src, Type lambda, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    Type& src_val = src[i];
-    if (src_val > 0) {
-      dst[i] = ((src_val > lambda) ? (src_val - lambda) : 0);
-    } else {
-      dst[i] = ((-src_val > lambda) ? (src_val + lambda) : 0);
-    }
-  }
-}
-}  // namespace naive
-
-template <typename Type>
-inline void addTo(Type* a, const Type* b, size_t len) {
-  naive::addTo(a, b, len);
-}
-
-template <typename Type>
-inline void batchAddTo(Type* a, const Type* b[], int batch, size_t len) {
-  naive::batchAddTo(a, b, batch, len);
-}
-
-template <typename Type>
-inline void colMax(Type* result, const Type* data, int dim, int numSamples) {
-  naive::colMax(result, data, dim, numSamples);
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type* lr, Type lambda, size_t len) {
-  naive::decayL1(dst, src, lr, lambda, len);
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type lambda, size_t len) {
-  naive::decayL1(dst, src, lambda, len);
-}
-
-template <size_t AlignSize>
-inline bool isPointerAlign(void* ptr) {
-  return reinterpret_cast<uintptr_t>(ptr) % AlignSize == 0;
-}
-
-inline bool vec_check(size_t len) {
-#ifdef __AVX__
-  return len % 8 == 0;
-#else
-  return len % 4 == 0;
-#endif
-}
-
-namespace internal {
-#ifdef __SSE3__
-void addToImpl(float* a, const float* b, size_t len);
-void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
-void colMaxImpl(float* result, const float* data, int dim, int numSamples);
-#endif
-#ifdef __AVX__
-void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
-void decayL1AvxImpl(
-    float* dst, float* src, float* lr, float lambda, size_t len);
-#endif
-}  // namespace internal
-
-template <>
-inline void addTo(float* a, const float* b, size_t len) {
-#ifdef __SSE3__
-  internal::addToImpl(a, b, len);
-#else
-  naive::addTo(a, b, len);
-#endif
-}
-
-template <>
-inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
-#ifdef __SSE3__
-  internal::batchAddToImpl(a, b, batch, len);
-#else
-  naive::batchAddTo(a, b, batch, len);
-#endif
-}
-
-template <>
-inline void colMax(float* result, const float* data, int dim, int numSamples) {
-#ifdef __SSE3__
-  internal::colMaxImpl(result, data, dim, numSamples);
-#else
-  naive::colMax(result, data, dim, numSamples);
-#endif
-}
-
-template <>
-inline void decayL1(float* dst, float* src, float lambda, size_t len) {
-#ifdef __AVX__
-  internal::decayL1AvxImpl(dst, src, lambda, len);
-#else
-  naive::decayL1(dst, src, lambda, len);
-#endif
-}
-
-template <>
-inline void decayL1(
-    float* dst, float* src, float* lr, float lambda, size_t len) {
-#ifdef __AVX__
-  internal::decayL1AvxImpl(dst, src, lr, lambda, len);
-#else
-  naive::decayL1(dst, src, lr, lambda, len);
-#endif
-}
-
-}  // namespace simd
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseMatrix.cpp b/paddle/legacy/math/SparseMatrix.cpp
deleted file mode 100644
index 6f68252b0a7..00000000000
--- a/paddle/legacy/math/SparseMatrix.cpp
+++ /dev/null
@@ -1,864 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseMatrix.h"
-#include <algorithm>
-#include <iostream>
-#include <vector>
-#include "hl_gpu.h"
-#include "hl_top_k.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-GpuSparseMatrix::GpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  resize(height, width, nnz, valueType, format);
-}
-
-GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                                 hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(dataHandle, height, width, trans, true) {
-  CHECK(dataHandle && sMatrix) << "Invalid argument pointer";
-
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  CHECK_LE(size, dataHandle->getSize());
-
-  sMatrix_ = sMatrix;
-
-  if (sMemoryHandle == NULL) {
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(dataHandle->getSize());
-  } else {
-    CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize());
-    sMemoryHandle_ = sMemoryHandle;
-  }
-
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-  if (format_ == SPARSE_CSR)
-    sparseResizeCSR();
-  else
-    sparseResizeCSC();
-}
-
-GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(NULL, height, width, trans, true) {
-  CHECK(sMatrix) << "Invalid argument pointer";
-  sMatrix_ = sMatrix;
-  sMemoryHandle_ = sMemoryHandle;
-  elementCnt_ = nnz;
-  format_ = format;
-  valueType_ = valueType;
-}
-
-GpuSparseMatrix::GpuSparseMatrix(real* value,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-
-  sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(size);
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSR,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSC,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSR() {
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (height_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        data_,
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSR,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSC() {
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (width_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        memoryHandle_->getBuf(),
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSC,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  if (format == SPARSE_CSR) {
-    resizeCSR(newHeight, newWidth, newNnz, valueType);
-  } else {
-    resizeCSC(newHeight, newWidth, newNnz, valueType);
-  }
-}
-
-void GpuSparseMatrix::resizeCSR(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newHeight + 1) * sizeof(int)) >
-            ((char*)cols_ - (char*)rows_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSR;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSR();
-  }
-}
-
-void GpuSparseMatrix::resizeCSC(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize,
-     *       and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newWidth + 1) * sizeof(int)) >
-            ((char*)rows_ - (char*)cols_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSC;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSC();
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight, newWidth, elementCnt_, valueType_, format_);
-}
-
-MatrixPtr GpuSparseMatrix::getTranspose() {
-  CHECK(memoryHandle_.get() || sMatrix_) << "not supported";
-  if (memoryHandle_.get()) {
-    MatrixPtr copy_T(new GpuSparseMatrix(
-        std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-        sMatrix_,
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true,
-        sMemoryHandle_));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_,
-                                         height_,
-                                         width_,
-                                         elementCnt_,
-                                         valueType_,
-                                         format_,
-                                         true,
-                                         sMemoryHandle_));
-    return copy_T;
-  }
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  memcpy(cols_ + offsets, row, sizeof(int) * colNum);
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (auto mat = dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<CpuSparseMatrix*>(mat)), stream);
-  } else if (auto mat = dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<GpuSparseMatrix*>(mat)), stream);
-  } else {
-    LOG(FATAL) << "Not implemented";
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src) {
-  copyFrom(src, HPPL_STREAM_1);
-  hl_stream_synchronize(HPPL_STREAM_1);
-}
-
-template <class T>
-void GpuSparseMatrix::copyFrom(int64_t* ids,
-                               int64_t* indices,
-                               T* data,
-                               hl_stream_t stream) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  size_t nnz = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    nnz += indices[id + 1] - indices[id];
-  }
-
-  resize(height_,
-         width_,
-         nnz,
-         sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE,
-         format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-
-    T* row = data + indices[id];
-    copyRow(rows_[i], colNum, row);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-  hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream);
-}
-
-void GpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (NO_VALUE == valueType_) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL == values);
-  } else {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL != values);
-  }
-  if (0 == row) {
-    rows_[row] = 0;
-  }
-  rows_[row + 1] = rows_[row] + colNum;
-
-  memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum);
-  if (FLOAT_VALUE == valueType_) {
-    memcpy(value_ + rows_[row], values, sizeof(*values) * colNum);
-  }
-
-  if (height_ - 1 == row) {
-    sMatrix_->format = HL_SPARSE_CSR;
-    sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-    sMatrix_->rows = height_;
-    sMatrix_->cols = width_;
-    sMatrix_->nnz = elementCnt_;
-    hl_memcpy_csr_matrix(
-        sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT);
-  }
-}
-
-SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
-
-void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK_EQ(format_, SPARSE_CSC);
-  int nnz = sMatrix_->nnz;
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuSparseMatrix>(
-        width_, height_, nnz, valueType_, format_, false);
-  } else {
-    CHECK(matTrans != nullptr);
-  }
-
-  CpuIVector rows(nnz);
-  CpuIVector cols(width_ + 1);
-  CpuIVector cols_full(nnz);
-  CpuVector value(nnz);
-  hl_stream_t stream = HPPL_STREAM_1;
-  hl_memcpy_from_csc_matrix(value.getData(),
-                            nnz,
-                            rows.getData(),
-                            nnz,
-                            cols.getData(),
-                            width_ + 1,
-                            sMatrix_.get(),
-                            stream);
-
-  hl_stream_synchronize(stream);
-
-  /*for every non zero number, get its column index*/
-  std::vector<Element> dataVec;
-  for (size_t i = 0; i < width_; i++) {
-    for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) {
-      cols_full.getData()[j] = i;
-    }
-  }
-
-  /*sort row index and column index by the ascending order*/
-  for (int i = 0; i < nnz; i++) {
-    dataVec.emplace_back(
-        rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
-  }
-  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
-    return a.row < b.row || (a.row == b.row && a.col < b.col);
-  });
-
-  /*get sorted data, row index, and col index, put them in the right place*/
-  cols.resize(height_ + 1);
-  rows.resize(nnz);
-  value.resize(nnz);
-
-  cols.getData()[0] = 0;
-  rows.getData()[0] = dataVec[0].col;
-  value.getData()[0] = dataVec[0].val;
-  for (int i = 1; i < nnz; i++) {
-    if (dataVec[i].row != dataVec[i - 1].row) {
-      for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) {
-        cols.getData()[j] = i;
-      }
-    }
-    rows.getData()[i] = dataVec[i].col;
-    value.getData()[i] = dataVec[i].val;
-  }
-  cols.getData()[height_] = nnz;
-
-  /*copy back from cpu*/
-  GpuSparseMatrixPtr dest =
-      std::dynamic_pointer_cast<GpuSparseMatrix>(matTrans);
-  hl_memcpy_csc_matrix((dest->sMatrix_).get(),
-                       value.getData(),
-                       rows.getData(),
-                       cols.getData(),
-                       stream);
-  hl_stream_synchronize(stream);
-}
-
-void GpuSparseMatrix::mul(const GpuMatrix& a,
-                          const GpuMatrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(a.useGpu_ && b.useGpu_) << "type not match";
-  CHECK(!trans_) << "trans not supported";
-  real* A_d = (real*)a.getData();
-  real* B_d = (real*)b.getData();
-  hl_sparse_matrix_s C_d = sMatrix_.get();
-  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-
-  if (!a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getWidth() == b.getHeight());
-  } else if (a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getWidth());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getHeight() == b.getHeight());
-  } else if (!a.trans_ && b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getHeight());
-    CHECK(a.getWidth() == b.getWidth());
-  } else {
-    LOG(INFO) << "Not support";
-  }
-  int dimM = height_;
-  int dimN = width_;
-  int dimK = !b.trans_ ? b.getHeight() : b.getWidth();
-  hl_sparse_matrix_mul(
-      A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT);
-}
-
-void GpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void GpuSparseMatrix::print(std::ostream& os) const {
-  if (format_ == SPARSE_CSC) {
-    int nnz = sMatrix_->nnz;
-    IVectorPtr rows = IVector::create(nnz, false);
-    IVectorPtr cols = IVector::create(width_ + 1, false);
-    VectorPtr value = Vector::create(nnz, false);
-    hl_stream_t stream = HPPL_STREAM_DEFAULT;
-    hl_memcpy_from_csc_matrix(value->getData(),
-                              value->getSize(),
-                              rows->getData(),
-                              rows->getSize(),
-                              cols->getData(),
-                              cols->getSize(),
-                              sMatrix_.get(),
-                              stream);
-    hl_stream_synchronize(stream);
-
-    printBuf(os, cols->getData(), width_ + 1, "col idx");
-    printBuf(os, rows->getData(), elementCnt_, "row idx");
-    printBuf(os, value->getData(), elementCnt_, "value");
-  }
-}
-
-void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK(trans_ == src.trans_);
-  CHECK(format_ == src.getFormat());
-  resize(src.getHeight(),
-         src.getWidth(),
-         elementCnt_,
-         valueType_,
-         src.getFormat());
-
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    hl_memcpy_async(
-        getValue(), src.getValue(), sizeof(real) * elementCnt_, stream);
-  }
-  CHECK(getRows());
-  CHECK(src.getRows());
-
-  hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream);
-  hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream);
-}
-
-void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) {
-  if (format_ == SPARSE_CSR) {
-    copyFromCSR(src, stream);
-  } else {
-    copyFromCSC(src, stream);
-  }
-}
-
-void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  int* srcCols = src.getCols();
-  size_t nnz = std::count_if(srcCols,
-                             srcCols + src.getElementCnt(),
-                             [this](size_t n) { return n < this->width_; });
-  resize(height_, width_, nnz, valueType_, format_);
-
-  rows_[0] = 0;
-  size_t index = 0;
-  for (size_t r = 0; r < height_; ++r) {
-    for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-      if (srcCols[i] < (int)width_) {
-        cols_[index] = srcCols[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-    }
-    rows_[r + 1] = index;
-  }
-  CHECK_EQ(index, nnz);
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  size_t nnz = src.getCols()[width_] - src.getCols()[0];
-  resize(height_, width_, nnz, valueType_, format_);
-
-  cols_[0] = 0;
-  for (size_t i = 0; i < width_; i++) {
-    cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i));
-  }
-  memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  if (format_ == SPARSE_CSR) {
-    trimFromCSR(src);
-  } else {
-    trimFromCSC(src);
-  }
-}
-
-void GpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_bias(A_d, b.getData(), scale);
-}
-
-void GpuSparseMatrix::add3(GpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* B_d = b->getData();
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0);
-}
-
-void GpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<GpuMatrix*>(b.get())) {
-    add3(dynamic_cast<GpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void GpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  real* value = getValue();
-  if (value == NULL) {
-    LOG(FATAL) << "value is nullptr";
-  }
-  hl_matrix_zero_mem(value, elementCnt_);
-}
-
-void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR";
-
-  hl_sparse_matrix_top_k(maxVal.getData(),
-                         maxVal.getStride(),
-                         maxIds.getData(),
-                         sMatrix_.get(),
-                         beam,
-                         numSamples);
-#endif
-}
-
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data,
-                                        hl_stream_t stream);
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data,
-                                        hl_stream_t stream);
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseMatrix.h b/paddle/legacy/math/SparseMatrix.h
deleted file mode 100644
index 9181fa29233..00000000000
--- a/paddle/legacy/math/SparseMatrix.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "CpuSparseMatrix.h"
-#include "Matrix.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
-
-class GpuSparseMatrix : public Matrix {
- public:
-  MemoryHandlePtr sMemoryHandle_;
-  int* rows_;
-  int* cols_;
-  real* value_;
-  const char* end_; /* point to the end of sMemoryHandle_ */
-
-  hl_sparse_matrix_s_ptr sMatrix_;
-  SparseValueType valueType_;
-  SparseFormat format_;
-
- public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false);
-
-  GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                  hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false,
-                  MemoryHandlePtr sMemoryHandle = NULL);
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans,
-                  MemoryHandlePtr sMemoryHandle);
-
- protected:
-  struct Element {
-    int row;
-    int col;
-    real val;
-    Element(int rowIn, int colIn, real valIn)
-        : row(rowIn), col(colIn), val(valIn) {}
-  };
-
- public:
-  ~GpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-
-  void resize(size_t newHeight, size_t newWidth);
-
-  void sparseResizeCSR();
-
-  void sparseResizeCSC();
-
-  void resizeCSR(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void resizeCSC(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-  /// B = A , B.trans = !A.trans
-  MatrixPtr getTranspose();
-
-  /// B = A'
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-  void copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const IVector& src) { LOG(FATAL) << "not implemented"; }
-  void copyFrom(const IVector& src, hl_stream_t stream) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream);
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-  SparseValueType getValueType() const;
-  SparseFormat getFormat() const { return format_; }
-
-  const int* getRowCols(size_t x) const { return cols_ + rows_[x]; }
-  const real* getRowValues(size_t x) const { return value_ + rows_[x]; }
-  size_t getColNum(size_t x) const { return rows_[x + 1] - rows_[x]; }
-  void print(std::ostream& os) const;
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   *
-   * Only add value of same (row, col) index in dense matrix
-   * and do not use others values.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(GpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  /**
-   * @brief return rows, which is gpu address
-   */
-  int* getRows() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_rows(sMatrix_.get());
-  }
-
-  /**
-   * @brief return cols, which is gpu address
-   */
-  int* getCols() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_cols(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value, which is gpu address
-   */
-  real* getValue() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_value(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief  Get top k value of each row in sparse matrix.
-   *
-   * Store the value in maxVal and theirs index in maxIds.
-   * k = maxVal.width
-   *
-   * @param[out]  maxIds    index of top k
-   * @param[out]  maxVal    value of top k
-   */
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-
- protected:
-  void sparseResize();
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
- public:
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFrom(GpuSparseMatrix& src, hl_stream_t stream);
-
-  void trimFrom(const CpuSparseMatrix& src);
-  void trimFromCSR(const CpuSparseMatrix& src);
-  void trimFromCSC(const CpuSparseMatrix& src);
-
-  // BaseMatrixT interface
- public:
-  bool isSparse() const { return true; }
-
- private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-
-}  // namespace paddle
-
-#else
-
-#include "CpuSparseMatrix.h"
-
-namespace paddle {
-
-class GpuSparseMatrix : public Matrix {
- public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, true) {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/SparseRowMatrix.cpp b/paddle/legacy/math/SparseRowMatrix.cpp
deleted file mode 100644
index 39bcdf22984..00000000000
--- a/paddle/legacy/math/SparseRowMatrix.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseRowMatrix.h"
-#include "CpuSparseMatrix.h"
-
-#include <algorithm>
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "SIMDFunctions.h"
-
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
-
-void SparseRowCpuMatrix::init(size_t height, size_t width) {
-  height_ = height;
-  if (!indexDictHandle_) {
-    indexDictHandle_.reset(new IndexDict);
-    indexDictHandle_->globalIndices.assign(height, kUnusedId_);
-  }
-  localIndices_ = &indexDictHandle_->localIndices;
-  globalIndices_ = indexDictHandle_->globalIndices.data();
-}
-
-void SparseRowCpuMatrix::mul(CpuSparseMatrix* a,
-                             CpuMatrix* b,
-                             real scaleAB,
-                             real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {
-  LOG(FATAL) << "This should not be called";
-}
-
-void SparseRowCpuMatrix::zeroMem() {
-  apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); });
-  clearRows();
-}
-
-void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) {
-  apply([=](real* buf, size_t len) {
-    CpuVector value(0, nullptr);
-    value.subVecFrom(buf, 0, len);
-    value.applyL1(learningRate, decayRate);
-  });
-}
-
-void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value,
-                                   IVector& t0,
-                                   real learningRate,
-                                   int currentTime,
-                                   real decayRate,
-                                   bool useL1,
-                                   bool fini) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-
-  // t0 and value are vectors
-  CHECK_EQ(t0.getSize(), this->height_);
-  CHECK_EQ(value.width_, this->height_ * this->width_);
-
-  if (decayRate == 0.0f) {
-    if (fini) {
-      return;
-    }
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-    }
-    return;
-  }  // else
-
-  if (useL1) {  // L1 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real delta = tDiff * learningRate * decayRate;
-          simd::decayL1(v, v, delta, this->width_);
-        }
-      }
-      return;
-    }  // else
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real delta = tDiff * learningRate * decayRate;
-        simd::decayL1(v, v, delta, this->width_);
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-      simd::decayL1(v, v, learningRate * decayRate, this->width_);
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-
-  } else {  // L2 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-          for (size_t j = 0; j < this->width_; ++j) {
-            v[j] *= recip;
-          }
-        }
-      }
-      return;
-    }  // else
-
-    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-        for (size_t j = 0; j < this->width_; ++j) {
-          v[j] *= recip;
-        }
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] = recipDecay * (v[j] - learningRate * g[j]);
-      }
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(BaseMatrix& dest,
-                               std::vector<uint32_t>& ids,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_);
-      ids.push_back(id);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      dest.checkIndex(id);
-      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));
-    }
-  }
-}
-
-void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a,
-                                     CpuMatrix* b,
-                                     real scaleAB,
-                                     real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-      a, b, this, scaleAB, scaleT);
-}
-
-void CacheRowCpuMatrix::mul(CpuSparseMatrix* a,
-                            CpuMatrix* b,
-                            real scaleAB,
-                            real scaleT) {
-  CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < len; i++) {
-    CHECK_LT(*(ids + i), this->getHeight())
-        << "id:" << *(ids + i) << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-  }
-  localIndices.insert(localIndices.end(), ids, ids + len);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
-  CHECK(mat) << "only support sparse matrix";
-  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
-          mat->getElementCnt());
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  size_t numSamples = ids->getSize();
-  int* index = ids->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-
-    unsigned int id = (unsigned int)index[i];
-    CHECK_LT(id, this->getHeight())
-        << "id:" << id << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-    localIndices.push_back(id);
-  }
-}
-
-void SparsePrefetchRowCpuMatrix::setupIndices() {
-  auto& localIndices = indexDictHandle_->localIndices;
-  uniqueIds(localIndices);
-  // for each sparse row
-  for (size_t id = 0; id < localIndices.size(); ++id) {
-    globalIndices_[localIndices[id]] = id;  // sparse row -> local id
-  }
-  checkStoreSize();
-}
-
-void SparseRowCpuMatrix::checkIndices() {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    CHECK_EQ(globalIndices_[localIndices[i]], i);
-  }
-  checkStoreSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseRowMatrix.h b/paddle/legacy/math/SparseRowMatrix.h
deleted file mode 100644
index e206747a41c..00000000000
--- a/paddle/legacy/math/SparseRowMatrix.h
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <gflags/gflags.h>
-#include <string.h>
-#include <algorithm>
-#include "Matrix.h"
-#include "RowBuffer.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * Sparse Row
- */
-class SparseRowCpuMatrix : public CpuMatrix {
- public:
-  struct IndexDict {
-    // In the following, global id means the row id in the original matrix.
-    // Local id means the row id in the local storage which only contains
-    // the sparse rows.
-    std::vector<unsigned int> localIndices;   // local id -> global id
-    std::vector<unsigned int> globalIndices;  // global id -> local id
-  };
-  typedef std::shared_ptr<IndexDict> IndexDictPtr;
-
-  /// heightStore is max number of rows of the sparse matrix.
-  SparseRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                     size_t height,
-                     size_t width,
-                     IndexDictPtr indexDictHandle = nullptr,
-                     bool trans = false)
-      : CpuMatrix(nullptr, height, width, trans),
-        indexDictHandle_(indexDictHandle) {
-    init(height, width);
-    buf_.reset(new RowBuffer(dataHandle, width));
-  }
-
-  virtual ~SparseRowCpuMatrix() {}
-
- public:
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in the original matrix
-   */
-  real* getRow(size_t row) {
-    CHECK_NE(globalIndices_[row], kUnusedId_);
-    return getLocalRow(globalIndices_[row]);
-  }
-
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in local storage
-   */
-  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
-
-  /**
-   *  reserve the storage for rows according to current size of
-   * indexDictHandle.
-   *
-   *  This is only used when SparseRowCpuMatrix is constructed with
-   *  indexDictHandle.
-   */
-  void reserveStore() { buf_->resize(localIndices_->size()); }
-
-  // row is the row id in the original matrix
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  /**
-   * Fill data according to row indexs added, setup indices inside.
-   *
-   * *src* and *size* are data and size of normal dense CpuMatrix.
-   */
-  virtual void copyFrom(const real* src, size_t size);
-  virtual void zeroMem();
-
-  /**
-   * apply L1 to all sparse rows, should be apply after indices ready.
-   */
-  virtual void applyL1(real learningRate, real decayRate);
-
-  void clearIndices() { clearRows(); }
-  void zeroMemThread(size_t tid, size_t numThreads);
-
-  /**
-   *  value -= grad * learningRate,  this is gradient.
-   *
-   * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
-   *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter
-   * matrix,
-   * store the time that each weight row last updated.
-   *
-   * Time is batchId, currentTime is current batchId.
-   *
-   * While pass finished, caller should call this func one more time
-   *  with (fini=true) to let weight decay catch up current time.
-   */
-  void sgdUpdate(BaseMatrix& value,
-                 IVector& t0,
-                 real learningRate,
-                 int currentTime,
-                 real decayRate,
-                 bool useL1,
-                 bool fini = false);
-
-  /**
-   *  merge rows in *this* to *dest* for designated thread
-   *
-   *  values add to *dest* matrix
-   *
-   *  ids occured in *this* append to *ids*
-   *  filtered by  (id % numThreads == tid)
-   */
-  void addTo(BaseMatrix& dest,
-             std::vector<uint32_t>& ids,
-             size_t tid,
-             size_t numThreads);
-
-  /**
-   *  the second version addTo(), *dest* is a SparseRowCpuMatrix.
-   *
-   *  The dest's indices should be setup already, addTo() will
-   *  check src ids is exist in dest's indices.
-   */
-  void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads);
-
-  const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; }
-
-  /**
-   *  check all local and global indices consistency
-   */
-  void checkIndices();
-  /**
-   *  check whether row *i* exist in indices
-   */
-  void checkIndex(size_t i) {
-    size_t localId = globalIndices_[i];
-    CHECK_LT(localId, localIndices_->size());
-    CHECK_EQ((*localIndices_)[localId], i);
-  }
-
-  std::vector<unsigned int>& getLocalIndices() const {
-    return indexDictHandle_->localIndices;
-  }
-
- protected:
-  template <typename Func>
-  void apply(Func f) {
-    f(buf_->data(), localIndices_->size() * width_);
-  }
-
-  void init(size_t height, size_t width);
-
-  /// clear row indices.
-  void clearRows() {
-    for (auto id : *localIndices_) {
-      globalIndices_[id] = kUnusedId_;
-    }
-    localIndices_->clear();
-    buf_->clear();
-  }
-
-  inline void checkStoreSize() {
-    if (buf_->isAutoGrowth()) {
-      if (buf_->getRowCount() > 0.5 * height_) {
-        LOG(WARNING) << "There are more than 0.5*height ("
-                     << localIndices_->size() << ") rows are used for sparse "
-                     << "update, which is not efficient. Considering not use "
-                     << "sparse_update.";
-      }
-    } else {
-      CHECK_LE(localIndices_->size(), buf_->getRowCount());
-    }
-  }
-
-  std::unique_ptr<RowBuffer> buf_;
-  IndexDictPtr indexDictHandle_;
-  std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
-  unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
-  static const unsigned int kUnusedId_;
-};
-
-class SyncThreadPool;
-
-/// For prefetching parameters from remote Parameter server
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                             size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             SyncThreadPool* pool = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans),
-        pool_(pool) {}
-
-  /**
-   * Extract feature ids from *input*, to fill row indexs.
-   *
-   * *input* must be sparse matrix.
-   *
-   * Can call many times before setup.
-   */
-  void addRows(MatrixPtr input);
-  void addRows(IVectorPtr ids);
-
-  /**
-   * setup global indices of SparseRowMatrix after finish add rows.
-   */
-  void setupIndices();
-
- protected:
-  void addRows(const unsigned int* ids, size_t len);
-  SyncThreadPool* pool_;
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  SparseAutoGrowRowCpuMatrix(size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {}
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-};
-
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
- public:
-  CacheRowCpuMatrix(size_t height,
-                    size_t width,
-                    IndexDictPtr indexDictHandle = nullptr,
-                    bool trans = false)
-      : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans),
-        sourceData_(nullptr) {}
-
-  void setSourceData(CpuVectorPtr sourceVec) {
-    sourceDataVec_ = sourceVec;
-    sourceData_ = sourceVec->getData();
-  }
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-      memcpy(
-          getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_);
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
- public:
-  CpuVectorPtr sourceDataVec_;
-  real* sourceData_;
-};
-
-/**
- * Sparse Row Ids Matrix.
- *
- * mostly same as CpuMatrix, but maintain sparse row ids occured,
- * ids are hashed by worker thread id.
- */
-class SparseRowIdsCpuMatrix : public CpuMatrix {
- public:
-  SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
-                        size_t height,
-                        size_t width,
-                        bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {}
-
-  void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); }
-
-  std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
-
- private:
-  std::vector<std::vector<uint32_t>> idsArray_;
-};
-
-}  // namespace paddle
-
-#else
-namespace paddle {
-
-class SparseRowCpuMatrix : public CpuMatrix {
- public:
-  void reserveStore() {}
-  void clearIndices() {}
-};
-
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  void setupIndices() {}
-  void addRows(MatrixPtr input) {}
-  void addRows(IVectorPtr ids) {}
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
-class SparseRowIdsCpuMatrix : public CpuMatrix {};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/Storage.cpp b/paddle/legacy/math/Storage.cpp
deleted file mode 100644
index 65d53aeaa92..00000000000
--- a/paddle/legacy/math/Storage.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Storage.h"
-#include "Allocator.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-DEFINE_int32(pool_limit_size,
-             536870912,
-             "maximum memory size managed by a memory pool, default is 512M");
-#else
-DEFINE_int32(pool_limit_size, 0, "default is 0");
-#endif
-
-namespace paddle {
-
-// Initialization StorageEngine singleton.
-// Other modules may rely on storage management,
-// so StorageEngine need to be initialized before other modules.
-static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
-                                          std::numeric_limits<int>::max());
-
-StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
-
-StorageEngine::~StorageEngine() {
-  delete cpuAllocator_;
-  for (auto it : gpuAllocator_) {
-    delete it;
-  }
-}
-
-StorageEngine* StorageEngine::singleton() {
-  static StorageEngine storage;
-  return &storage;
-}
-
-PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
-  {
-    // if gpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (deviceId < static_cast<int>(gpuAllocator_.size()) &&
-        (gpuAllocator_[deviceId] != nullptr)) {
-      return gpuAllocator_[deviceId];
-    }
-  }
-
-  {
-    // Construct gpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (deviceId >= static_cast<int>(gpuAllocator_.size())) {
-      gpuAllocator_.resize(deviceId + 1);
-    }
-    if (gpuAllocator_[deviceId] == nullptr) {
-      std::string name =
-          "gpu" + str::to_string(deviceId) + std::string("_pool");
-      gpuAllocator_[deviceId] =
-          new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
-    }
-    return gpuAllocator_[deviceId];
-  }
-}
-
-PoolAllocator* StorageEngine::getCpuAllocator() {
-  {
-    // if cpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (cpuAllocator_ != nullptr) {
-      return cpuAllocator_;
-    }
-  }
-
-  {
-    // Construct cpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (cpuAllocator_ == nullptr) {
-      if (FLAGS_use_gpu) {
-        cpuAllocator_ = new PoolAllocator(
-            new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool");
-      } else {
-        cpuAllocator_ = new PoolAllocator(
-            new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool");
-      }
-    }
-    return cpuAllocator_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Storage.h b/paddle/legacy/math/Storage.h
deleted file mode 100644
index bd22dde2c85..00000000000
--- a/paddle/legacy/math/Storage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>
-#include <vector>
-#include "PoolAllocator.h"
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * @brief Storage manager for multiple devices.
- */
-class StorageEngine {
- public:
-  /**
-   * @return Storage singleton
-   */
-  static StorageEngine* singleton();
-
-  /**
-   * @return return one gpu allocator by deviceId
-   */
-  PoolAllocator* getGpuAllocator(int deviceId);
-
-  /**
-   * @return return cpu allocator
-   */
-  PoolAllocator* getCpuAllocator();
-
- protected:
-  StorageEngine();
-  ~StorageEngine();
-  RWLock lock_;
-  std::vector<PoolAllocator*> gpuAllocator_;
-  PoolAllocator* cpuAllocator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorApply.h b/paddle/legacy/math/TensorApply.h
deleted file mode 100644
index 8b642047bff..00000000000
--- a/paddle/legacy/math/TensorApply.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-
-/**
- * \brief The tensor evaluator classes.
- */
-template <typename Derived, class T>
-class TensorApply {
- public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
-  INLINE T& applyRef(int index) { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-/**
- * \brief The tensor evaluator classes.
- * evaluator for rvalues
- */
-template <typename Derived, class T>
-class TensorApply<const Derived, T> {
- public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  const T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-template <typename Derived, class T>
-class TensorApply<const TensorExpression<Derived, T>, T> {
- public:
-  explicit TensorApply(const TensorExpression<Derived, T>& expr)
-      : expr_(expr.derived()) {}
-
-  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
-  INLINE T apply(int index) const { return expr_.apply(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  TensorApply<const Derived, T> expr_;
-};
-
-/**
- * \brief The unary expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
- public:
-  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
-  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-/**
- * \brief The binary expression evaluator classes.
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
- public:
-  explicit INLINE TensorApply(
-      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
-      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
-  }
-  INLINE T apply(int index) const {
-    return op_(lhs_.apply(index), rhs_.apply(index));
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
-  const OP op_;
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<RhsType, T> rhs_;
-};
-
-/**
- * \brief The ternary expression evaluator classes.
- */
-template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
-class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
- public:
-  explicit INLINE TensorApply(
-      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
-      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
-    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
-    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
-    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
-    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
-    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
-  }
-  INLINE T apply(int index) const {
-    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return expr1_.getWidth(); }
-  INLINE size_t getHeight() const { return expr1_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return expr1_.isContiguous() && expr2_.isContiguous() &&
-           expr3_.isContiguous();
-  }
-  INLINE bool useGpu() const { return expr1_.useGpu(); }
-
-  TensorApply<ArgType1, T> expr1_;
-  TensorApply<ArgType2, T> expr2_;
-  TensorApply<ArgType3, T> expr3_;
-};
-
-/**
- * \brief The const expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
- public:
-  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(i, j); }
-  INLINE T apply(int index) const { return op_(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return true; }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorAssign.h b/paddle/legacy/math/TensorAssign.h
deleted file mode 100644
index efbfce6c4f8..00000000000
--- a/paddle/legacy/math/TensorAssign.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Tensor Assign Expression(return by lazyAssign,
- * and evaluated by AssignEvaluate)
- */
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp {
- public:
-  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
-      : lhs_(lhs), rhs_(rhs) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE void apply(const int i, const int j) {
-    lhs_.applyRef(i, j) = rhs_.apply(i, j);
-  }
-  INLINE void apply(const int index) {
-    lhs_.applyRef(index) = rhs_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
- private:
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<const RhsType, T> rhs_;
-};
-
-template <typename Assign, typename... AssignOp>
-void AssignCpuEvaluate(int height,
-                       int width,
-                       bool isContiguous,
-                       Assign&& assign,
-                       AssignOp&&... args) {
-  if (isContiguous) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      assign.apply(index);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        assign.apply(i, j);
-        __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate1(const int border,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    assign.apply(idx);
-    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
-  }
-}
-
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate2(const int height,
-                                   const int width,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
-      assign.apply(i, j);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-    }
-  }
-}
-#endif
-
-/**
- * \brief Evaluate one or more TensorAssignOp objects.
- *
- * \note At least one assignment expression is required
- */
-template <typename Assign, typename... AssignOp>
-void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
-  const bool useGpu_ = assign.useGpu();
-  bool isContiguous_ = assign.isContiguous();
-  const size_t height = assign.getHeight();
-  const size_t width = assign.getWidth();
-
-  const int packSize = sizeof...(args);
-  const bool packUseGpu[] = {((args)).useGpu()...};
-  const bool packIsContiguous[] = {((args)).isContiguous()...};
-  const size_t packHeight[] = {((args)).getHeight()...};
-  const size_t packWidth[] = {((args)).getWidth()...};
-
-  for (int i = 0; i < packSize; i++) {
-    CHECK_EQ(useGpu_, packUseGpu[i]);
-    CHECK_EQ(height, packHeight[i]);
-    CHECK_EQ(width, packWidth[i]);
-    isContiguous_ = isContiguous_ && packIsContiguous[i];
-  }
-
-  if (useGpu_) {
-#ifdef __NVCC__
-    if (isContiguous_) {
-      int size = height * width;
-      int blockSize = size <= 1024 ? size : 1024;
-      int gridSize = (size + 1024 - 1) / 1024;
-      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-          size, assign, args...);
-    } else {
-      int blockSizeY = std::min(32, (int)height);
-      int blockSizeX = (32 / blockSizeY) * 32;
-      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
-      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
-      dim3 threads(blockSizeX, blockSizeY);
-      dim3 grid(gridSizeX, gridSizeY);
-      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          height, width, assign, args...);
-    }
-
-    CHECK_SYNC("AssignEvaluate failed");
-#endif
-  } else {
-    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorEvaluate.h b/paddle/legacy/math/TensorEvaluate.h
deleted file mode 100644
index 3029dd35fb0..00000000000
--- a/paddle/legacy/math/TensorEvaluate.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "hl_base.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief The tensor cpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int height = lhs_.getHeight();
-  int width = lhs_.getWidth();
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      lhs_.applyRef(index) = rhs_.apply(index);
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        lhs_.applyRef(i, j) = rhs_.apply(i, j);
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs,
-                                    RightType rhs,
-                                    const int border) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    lhs.applyRef(idx) = rhs.apply(idx);
-  }
-}
-
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
-      lhs.applyRef(i, j) = rhs.apply(i, j);
-    }
-  }
-}
-
-/**
- * \brief The tensor gpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int dimM = lhs_.getHeight();
-  int dimN = lhs_.getWidth();
-
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-        lhs_, rhs_, size);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
-  }
-
-  CHECK_SYNC("TensorGpuApply failed");
-}
-#else
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
-  LOG(FATAL) << "Since it is gcc compiled, "
-                "this calculation does not support GPU implementation.";
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorExpression.h b/paddle/legacy/math/TensorExpression.h
deleted file mode 100644
index 1c6cf078314..00000000000
--- a/paddle/legacy/math/TensorExpression.h
+++ /dev/null
@@ -1,446 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "hl_tensor_ops.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-template <class OP, typename ExprType, class T>
-class TensorConstant;
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp;
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp;
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp;
-
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp;
-
-/**
- * \brief Tensor base class.
- *
- * This is the base class of all Tensor and Expression class.
- */
-template <typename Derived, class T>
-class TensorExpression {
- public:
-  /**
-   * Element wise unary expression.
-   */
-  template <typename UnaryOp>
-  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
-      const UnaryOp& op) const {
-    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
-  }
-
-  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-      T p) const {
-    return unaryExpression(hppl::unary::add_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
-      T p) const {
-    return unaryExpression(hppl::unary::sub_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-      T p) const {
-    return unaryExpression(hppl::unary::mul_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
-      T p) const {
-    return unaryExpression(hppl::unary::div_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
-    return unaryExpression(hppl::unary::neg<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
-    return unaryExpression(hppl::unary::exp_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
-    return unaryExpression(hppl::unary::log_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
-    return unaryExpression(hppl::unary::sqrt_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
-    return unaryExpression(hppl::unary::square<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
-      const {
-    return unaryExpression(hppl::unary::reciprocal<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
-    return unaryExpression(hppl::unary::abs<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
-    return unaryExpression(hppl::unary::sign<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
-    return unaryExpression(hppl::unary::pow_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
-    return unaryExpression(hppl::unary::min<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
-    return unaryExpression(hppl::unary::max<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_eq<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ne<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_le<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_lt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ge<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_gt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
-      T p) const {
-    return unaryExpression(hppl::unary::and_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
-      T p) const {
-    return unaryExpression(hppl::unary::or_op<T>(p));
-  }
-
-  /**
-   * Element wise binary expression.
-   */
-  template <typename BinaryOp, typename ExpressionType>
-  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
-  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
-    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
-        op, derived(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator==(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator!=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_le<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::and_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator&&(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::and_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::or_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator||(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::or_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::add<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator+(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::add<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::sub<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator-(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::sub<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::mul<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator*(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::mul<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::div<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator/(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::div<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::min<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  min(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::min<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::max<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  max(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::max<T>(), expr);
-  }
-
-  /**
-   * Element wise ternary expression.
-   *
-   * ternary conditional operator(?: operator).
-   * The conditional expression returns one of two values depending on
-   * the result of derived expression.
-   * If derived expression evaluates to true, then expression1 is evaluated.
-   * If derived expression evaluates to false, then expression2 is evaluated.
-   */
-  template <typename ExprType1, typename ExprType2>
-  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
-  condition(const ExprType1& expr1, const ExprType2& expr2) const {
-    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
-        derived(), expr1, expr2);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const ExprType,
-      T>
-  condition(T p, const ExprType& expr) const {
-    return condition(constant(p), expr);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const ExprType,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(const ExprType& expr, T p) const {
-    return condition(expr, constant(p));
-  }
-
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(T p1, T p2) const {
-    return condition(constant(p1), constant(p2));
-  }
-
-  /**
-   * return a TensorConstant. A TensorConstant object hold a constant value.
-   */
-  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
-      T p) const {
-    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
-        hppl::unary::constant<T>(p), derived());
-  }
-
-  /**
-   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
-   * TensorAssignOp objects.
-   */
-  template <typename ExpressionType>
-  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
-      const ExpressionType& expr) const {
-    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
-  }
-
- protected:
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
-};
-
-/**
- * \brief Unary Operator Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp
-    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
- public:
-  explicit TensorUnaryOp(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief Binary Operator Expression
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp
-    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
- public:
-  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
-      : op_(op), lhs_(lhs), rhs_(rhs) {}
-
-  const OP op_;
-  const LhsType lhs_;
-  const RhsType rhs_;
-};
-
-/**
- * \brief Ternary Operator Expression
- */
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp : public TensorExpression<
-                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
-                            T> {
- public:
-  explicit TensorTernaryOp(const ExprType1& expr1,
-                           const ExprType2& expr2,
-                           const ExprType3& expr3)
-      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
-
-  const ExprType1 expr1_;
-  const ExprType2 expr2_;
-  const ExprType3 expr3_;
-};
-
-/**
- * \brief Constant Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorConstant
-    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
- public:
-  explicit TensorConstant(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief operator+ overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr + p;
-}
-
-/**
- * \brief operator* overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr * p;
-}
-
-}  // namespace paddle
-
-#include "TensorApply.h"
-#include "TensorEvaluate.h"
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.cu b/paddle/legacy/math/TrainingAlgorithmOp.cu
deleted file mode 100644
index 9e1eaa0f45a..00000000000
--- a/paddle/legacy/math/TrainingAlgorithmOp.cu
+++ /dev/null
@@ -1,356 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseMatrix.h"
-#include "TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#if __cplusplus > 199711L
-
-#include "TensorAssign.h"
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
-  auto expr2 =
-      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
-  auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
-                                ((real)1 / beta) * momV);
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
-  auto expr2 =
-      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
-  auto expr3 = accum_update.lazyAssign(rou * accum_update +
-                                       ((real)1 - rou) * (grad * lr).square());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  auto expr1 = accum.lazyAssign(accum + grad.square());
-  auto expr2 =
-      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4);
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
-  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  } else {
-    auto expr1 =
-        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  }
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  } else {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
-                                  ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  }
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
-  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 =
-      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
-  auto expr3 = value.lazyAssign(
-      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-}  // namespace paddle
-
-#else
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  /**
-   * \alpha_t = \alpha_{t-1} / k
-   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
-   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-   */
-  momU -= (alpha * gamma * learningRate) * grad;
-  momV += (tau * alpha * gamma * learningRate) * grad;
-  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  accum = rou * accum + ((real)1 - rou) * grad.square();
-
-  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
-  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  accum += grad.square();
-  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    g = accumulatedRou * g + grad.square();
-  } else {
-    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
-  }
-
-  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
-  f = accumulatedRou * f + ((real)1 - rou) * grad;
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  lr = (g - f.square() + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    accum = accumulatedRou * accum + grad.square();
-  } else {
-    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
-  }
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  lr = (accum + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  v = beta2 * v + ((real)1 - beta2) * grad.square();
-
-  value -= (mom * alpha) / (v.sqrt() + epsilon);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
-}
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.h b/paddle/legacy/math/TrainingAlgorithmOp.h
deleted file mode 100644
index 921c2742cfe..00000000000
--- a/paddle/legacy/math/TrainingAlgorithmOp.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BaseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Sparse Momentum optimizer.
- */
-extern void sparseMomentumApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& momU,
-                                BaseMatrix& momV,
-                                real alpha,
-                                real beta,
-                                real gamma,
-                                real tau,
-                                real learningRate);
-
-/**
- * \brief AdaDelta optimizer.
- */
-extern void adadeltaApply(BaseMatrix& value,
-                          BaseMatrix& grad,
-                          BaseMatrix& sum,
-                          BaseMatrix& sum1,
-                          BaseMatrix& mom,
-                          BaseMatrix& lr,
-                          real rou,
-                          real epsilon,
-                          real learningRate,
-                          real momentum,
-                          real decayRate);
-
-/**
- * \brief AdaGrad optimizer.
- */
-extern void adagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& sum,
-                         BaseMatrix& sum1,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate);
-
-/**
- * \brief RMSProp optimizer.
- */
-extern void rmspropApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& g,
-                         BaseMatrix& f,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime);
-
-/**
- * \brief Decayed AdaGrad optimizer.
- */
-extern void decayedAdagradApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& mom,
-                                BaseMatrix& accum,
-                                BaseMatrix& lr,
-                                real accumulatedRou,
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate,
-                                bool firstTime);
-
-/**
- * \brief Adam optimizer.
- */
-extern void adamApply(BaseMatrix& value,
-                      BaseMatrix& grad,
-                      BaseMatrix& mom,
-                      BaseMatrix& v,
-                      real beta1,
-                      real beta2,
-                      real beta1_power,
-                      real beta2_power,
-                      real epsilon,
-                      real learningRate);
-
-/**
- * \brief AdaMax optimizer.
- */
-extern void adamaxApply(BaseMatrix& value,
-                        BaseMatrix& grad,
-                        BaseMatrix& mom,  // firse moment
-                        BaseMatrix& u,    // weighted infinity norm
-                        real beta1,
-                        real beta2,
-                        int64_t step,
-                        real alpha);
-}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.cpp b/paddle/legacy/math/Vector.cpp
deleted file mode 100644
index 87f48bb1622..00000000000
--- a/paddle/legacy/math/Vector.cpp
+++ /dev/null
@@ -1,1091 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Vector.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <memory>
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "hl_matrix.h"
-#include "hl_table_apply.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::createParallelVector(
-    size_t size, bool useGpu, SyncThreadPool* pool) {
-  if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector &&
-      size >= (size_t)FLAGS_enable_parallel_vector) {
-    return std::make_shared<ParallelCpuVectorT<T>>(
-        size, pool ? pool : getGlobalSyncThreadPool());
-  } else {
-    return create(size, useGpu);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(T* data,
-                                               size_t size,
-                                               bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size, data);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size, data);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size,
-                                               MemoryHandlePtr memoryHandle,
-                                               size_t offset) {
-  if (auto cpuMemHandle =
-          std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<CpuVectorT<T>>(size, cpuMemHandle, offset);
-  } else if (auto gpuMemHandle =
-                 std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<GpuVectorT<T>>(size, gpuMemHandle, offset);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return NULL;
-  }
-}
-
-template <>
-MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  LOG(FATAL) << "Wrong for real vector";
-  return nullptr;
-}
-
-template <>
-MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  size_t height = getSize();
-  size_t width = idRange;
-  MatrixPtr mat = Matrix::createSparseMatrix(
-      height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);
-
-  CpuIVector cpuIds(height);
-  cpuIds.copyFrom(*this);
-  int* idData = cpuIds.getData();
-
-  for (decltype(height) i = 0; i < height; i++) {
-    const unsigned int id = idData[i];
-    CHECK_LT(id, width);
-    mat->setRow(i, 1, &id, nullptr);
-  }
-  return mat;
-}
-
-template <>
-std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
-  std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
-  if (useGpu_) {
-    hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
-  } else {
-    for (size_t i = 0; i < getSize(); ++i) {
-      ret->getData()[i] = int(this->getData()[i]);
-    }
-  }
-  return ret;
-}
-
-template <class T>
-GpuVectorT<T>::GpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<GpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 true /* useGpu = true */) {}
-
-template <class T>
-T GpuVectorT<T>::getElement(size_t i) const {
-  T elem = 0;
-  hl_memcpy_device2host(&elem, const_cast<T*>(&this->getData()[i]), sizeof(T));
-  return elem;
-}
-template <class T>
-void GpuVectorT<T>::setElement(size_t i, const T& value) {
-  hl_memcpy_host2device(&this->getData()[i], const_cast<T*>(&value), sizeof(T));
-}
-
-template <class T>
-T* GpuVectorT<T>::getPoint(const uint64_t beginPos) {
-  LOG(FATAL) << "Not implemented" << beginPos;
-  return NULL;
-}
-
-template <>
-int GpuVectorT<int>::getAbsSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-int GpuVectorT<int>::getSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getAbsSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_abs_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-real GpuVectorT<real>::getSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-int GpuVectorT<int>::getMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getMax();
-}
-
-template <>
-int GpuVectorT<int>::getAbsMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getAbsMax();
-}
-
-template <class T>
-void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  BaseMatrixT<T>::isEqualTo((BaseMatrixT<T>&)b, value);
-}
-
-template <class T>
-void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifdef PADDLE_WITH_CUDA
-  hl_vector_select_from<T>(this->getData(),
-                           this->getSize(),
-                           src.getData(),
-                           src.getSize(),
-                           ids.getData(),
-                           ids.getSize());
-#endif
-}
-
-template <class Func>
-real gpuRowFunc(Func f, GpuVector& v) {
-  static ThreadLocal<std::unique_ptr<CpuVectorT<real>>> local;
-  if (!*local) {
-    (*local).reset(new CpuVector(1));
-  }
-  real* A = v.getData();
-  f(A, (*local)->getData(), 1, v.getSize());
-  return (*local)->getData()[0];
-}
-
-template <>
-real GpuVectorT<real>::getMax() {
-  return gpuRowFunc(hl_matrix_row_max, *this);
-}
-
-template <>
-real GpuVectorT<real>::getAbsMax() {
-  return std::max(gpuRowFunc(hl_matrix_row_max, *this),
-                  -gpuRowFunc(hl_matrix_row_min, *this));
-}
-
-template <>
-int GpuVectorT<int>::getMin() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getMin() {
-  return gpuRowFunc(hl_matrix_row_min, *this);
-}
-
-template <class T>
-T GpuVectorT<T>::get(size_t pos) {
-  T val = (T)0;
-  hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T));
-  return val;
-}
-
-template <class T>
-void GpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::zeroMem() {
-  BaseMatrixT<T>::zero();
-}
-
-template <class T>
-void GpuVectorT<T>::reset(const T& value) {
-  BaseMatrixT<T>::assign(value);
-}
-
-template <class T>
-void GpuVectorT<T>::fillSequence() {
-  LOG(FATAL) << "not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  CHECK_EQ(src.getSize(), this->getSize());
-  hl_memcpy_async((void*)this->getData(),
-                  (void*)src.getData(),
-                  sizeof(T) * this->getSize(),
-                  stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy_async(
-      (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2device((void*)dest->getData(),
-                          (void*)this->getData(),
-                          sizeof(T) * this->getSize());
-}
-
-template <>
-void GpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  IVectorPtr dest = IVector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  VectorPtr dest = Vector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-template <>
-void GpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::rand() {
-  VectorPtr cPtr = Vector::create(this->size_, false);
-  cPtr->rand();
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real));
-}
-
-template <>
-void GpuVectorT<int>::rand(size_t classNum) {
-  IVectorPtr cPtr = IVector::create(this->size_, false);
-  cPtr->rand(classNum);
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int));
-}
-
-template <>
-void CpuVectorT<int>::rand(size_t classNum) {
-  size_t size = this->getSize();
-  int* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] =
-        std::min(classNum - 1,
-                 size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum));
-  }
-}
-
-template <>
-void CpuVectorT<real>::rand() {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] = ::rand() * (1. / (double)RAND_MAX);
-    // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) *
-    // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 );
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void CpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::randnorm(real mean, real std) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size - 1; i += 2) {
-    real r1 = rand1();
-    r1 = std::sqrt(-2 * std::log(r1));
-    real r2 = rand1();
-    data[i] = mean + std * r1 * cos(2 * M_PI * r2);
-    data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2);
-  }
-  real r1 = rand1();
-  r1 = std::sqrt(-2 * std::log(r1));
-  real r2 = rand1();
-  data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2);
-}
-
-template <>
-void CpuVectorT<real>::uniform(real left, real right) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  real range = right - left;
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size; ++i) {
-    data[i] = rand1() * range + left;
-  }
-}
-
-template <>
-void GpuVectorT<real>::randnorm(real mean, real std) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.randnorm(mean, std);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <>
-void GpuVectorT<real>::uniform(real left, real right) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.uniform(left, right);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<CpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(const VectorT<T>& src)
-    : VectorT<T>(src.getSize(),
-                 src.getMemoryHandle(),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {
-  if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) {
-    this->memoryHandle_ =
-        std::make_shared<CpuMemoryHandle>(sizeof(T) * this->getSize());
-    this->data_ = reinterpret_cast<T*>(this->memoryHandle_->getBuf());
-  }
-  src.copyTo(this);
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-// cannot use above version, due to precision issue of float
-template <>
-real CpuVectorT<real>::getAbsSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::getSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <>
-real CpuVectorT<real>::getSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::get(size_t pos) {
-  return this->getData()[pos];
-}
-
-template <class T>
-T CpuVectorT<T>::getMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res < A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = std::abs(A[0]);
-  for (size_t i = 1; i < size; i++) {
-    if (res < std::abs(A[i])) res = std::abs(A[i]);
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getMin() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res > A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-void CpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  size_t size = this->getSize();
-  CHECK_EQ(b.getSize(), size);
-
-  const T* B = b.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = (B[i] == value);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-  size_t size = this->getSize();
-  CHECK_EQ(ids.getSize(), size);
-
-  const int* indices = ids.getData();
-  const T* B = src.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    int index = indices[i];
-    CHECK_LT(index, (int)src.getSize());
-    A[i] = B[index];
-  }
-}
-
-static int getSignAndExponentOfFloat(float a) {
-  uint32_t* pa = reinterpret_cast<uint32_t*>(&a);
-  return *pa >> 23;
-}
-
-template <class T>
-void CpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::histogram(std::ostream& os, int type) {
-  int counters[512];
-  memset(counters, 0, sizeof(counters));
-  int counterZero = 0;
-
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (A[i] == 0.0f) {
-      ++counterZero;
-    } else {
-      ++counters[getSignAndExponentOfFloat(A[i])];
-    }
-  }
-
-  int64_t sum = 0;
-  float sizeNonZero = size - counterZero;
-  os << "zero:" << counterZero;
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i];
-    if (counter) {
-      os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i + 256];
-    if (counter) {
-      os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  os << ", nonzero_exponent_avg=" << sum / sizeNonZero;
-}
-
-template <class T>
-void CpuVectorT<T>::zeroMem() {
-  memset(this->getData(), 0, sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::reset(const T& value) {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = value;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::fillSequence() {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = i;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  if (typeid(src) == typeid(GpuVectorT<T>)) {
-    hl_memcpy_async((void*)this->getData(),
-                    (void*)src.getData(),
-                    sizeof(T) * this->getSize(),
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else {
-    src.copyTo(this);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc, size_t size) {
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc,
-                             size_t size,
-                             hl_stream_t stream) {
-  (void)stream;
-
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  hl_memcpy_host2device((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <>
-void CpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << (int)data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << data_[idx] << ";";
-}
-
-template <>
-void CpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << (int)data_[idx] << ";";
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::parallelExec(ExecFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::parallelExec(ExecFunc func) {
-  pool_->exec([this, func](int tid, size_t numThreads) {
-    auto interval = calcSplitArrayInterval(
-        this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    // setup sub bufs
-    CpuVector subVec(0, nullptr);
-    subVec.subVecFrom(*this, interval);
-    func(subVec);
-  });
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::exec(SyncThreadPool::JobFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
-  pool_->exec(func);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src)
-    : sync_(nullptr) {
-  bool useGpu = src->useGpu();
-  if (useGpu) {
-    gpuVectorT_ = src;
-  } else {
-    cpuVectorT_ = src;
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, T* data, bool useGpu)
-    : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size, data);
-    setSync(DATA_AT_CPU);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size, data);
-    setSync(DATA_AT_GPU);
-  }
-}
-
-template <class T>
-std::shared_ptr<CpuGpuVectorT<T>> CpuGpuVectorT<T>::create(size_t size,
-                                                           bool useGpu) {
-  return std::make_shared<CpuGpuVectorT<T>>(size, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resize(size_t size, bool useGpu) {
-  if (useGpu) {
-    CHECK(gpuVectorT_) << "gpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (gpuVectorT_->getMemoryHandle()) {
-      gpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(gpuVectorT_->getSize(), size);
-    }
-  } else {
-    CHECK(cpuVectorT_) << "cpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (cpuVectorT_->getMemoryHandle()) {
-      cpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(cpuVectorT_->getSize(), size);
-    }
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                                      size_t size,
-                                      bool useGpu) {
-  if (vec) {
-    vec->resize(size, useGpu);
-  } else {
-    vec = create(size, useGpu);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
-  if (useGpu && (!gpuVectorT_)) {
-    gpuVectorT_ = VectorT<T>::create(size, true);
-  } else if ((!useGpu) && (!cpuVectorT_)) {
-    cpuVectorT_ = VectorT<T>::create(size, false);
-  } else {
-    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
-    this->resize(size, useGpu);
-  }
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size)
-    : sync_(nullptr) {
-  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifdef PADDLE_WITH_CUDA
-  SyncedFlag* flag = src.getSync();
-  if (*flag == DATA_AT_CPU) {
-    src.copyToGpu();  // will set synchronous data between CPU and GPU
-  } else if (*flag == DATA_AT_GPU) {
-    src.copyToCpu();  // will set synchronous data between CPU and GPU
-  }
-#endif
-  auto cMemHandle = (src.getVector(false))->getMemoryHandle();
-  cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifdef PADDLE_WITH_CUDA
-  auto gMemHandle = (src.getVector(true))->getMemoryHandle();
-  gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
-  src.setSync(SYNCED);
-#endif
-  setSync(src.getSync());
-}
-
-template <class T>
-std::shared_ptr<const VectorT<T>> CpuGpuVectorT<T>::getVector(
-    bool useGpu) const {
-  auto* self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return std::const_pointer_cast<const VectorT<T>>(gpuVectorT_);
-  } else {
-    self->copyToCpu();
-    return std::const_pointer_cast<const VectorT<T>>(cpuVectorT_);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>>& CpuGpuVectorT<T>::getMutableVector(bool useGpu) {
-  setSync(useGpu);
-  if (useGpu) {
-    copyToGpu();
-    return gpuVectorT_;
-  } else {
-    copyToCpu();
-    return cpuVectorT_;
-  }
-}
-
-template <class T>
-const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
-  auto self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return gpuVectorT_->getData();
-  } else {
-    self->copyToCpu();
-    return cpuVectorT_->getData();
-  }
-}
-
-// Operation will change data and need to reset sync_ & syncFlag_.
-#define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
-  do {                                         \
-    if (useGpu) {                              \
-      copyToGpu();                             \
-      setSync(useGpu);                         \
-      return gpuVectorT_->OP(args);            \
-    } else {                                   \
-      copyToCpu();                             \
-      setSync(useGpu);                         \
-      return cpuVectorT_->OP(args);            \
-    }                                          \
-  } while (0)
-
-template <class T>
-T* CpuGpuVectorT<T>::getMutableData(bool useGpu) {
-  MUTABLE_VECTOR_OP(getData, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::zeroMem(bool useGpu) {
-  MUTABLE_VECTOR_OP(zeroMem, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::fillSequence(bool useGpu) {
-  MUTABLE_VECTOR_OP(fillSequence, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::setElement(size_t i, const T& value, bool useGpu) {
-  MUTABLE_VECTOR_OP(setElement, useGpu, i, value);
-}
-
-template <class T>
-T CpuGpuVectorT<T>::getElement(size_t i) const {
-  switch (*this->getSync()) {
-    case SYNCED:
-    case DATA_AT_CPU:
-      return cpuVectorT_->getElement(i);
-      break;
-    case DATA_AT_GPU:
-      return gpuVectorT_->getElement(i);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  auto cVec = dynamic_cast<const CpuVectorT<T>*>(&src);
-  auto gVec = dynamic_cast<const GpuVectorT<T>*>(&src);
-  if (cVec) {
-    copyToCpu(cVec->getData(), cVec->getSize(), stream);
-  } else if (gVec) {
-    copyToGpu(gVec->getData(), gVec->getSize(), stream);
-  } else {
-    LOG(FATAL) << "Invalid type of src";
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data, size_t size, bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size);
-  } else {
-    copyToCpu(data, size);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data,
-                                size_t size,
-                                hl_stream_t stream,
-                                bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size, stream);
-  } else {
-    copyToCpu(data, size, stream);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size,
-                                bool useGpu,
-                                hl_stream_t stream) {
-  if (useGpu) {
-    VectorT<T>::resizeOrCreate(gpuVectorT_, size, true);
-    gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream);
-  } else {
-    VectorT<T>::resizeOrCreate(cpuVectorT_, size, false);
-    cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream) {
-  switch (*src.getSync()) {
-    case DATA_AT_CPU:
-      copyFrom(*(src.getVector(false)), stream);
-      break;
-    case DATA_AT_GPU:
-      copyFrom(*(src.getVector(true)), stream);
-      break;
-    case SYNCED:
-      copyFrom(*(src.getVector(false)), stream);
-      copyFrom(*(src.getVector(true)), stream);
-      setSync(SYNCED);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToCpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_GPU:
-      CHECK(gpuVectorT_);
-      this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_CPU:
-    case SYNCED:
-      CHECK(cpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToGpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_CPU:
-      CHECK(cpuVectorT_);
-      this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_GPU:
-    case SYNCED:
-      CHECK(gpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template class VectorT<real>;
-template class VectorT<int>;
-template class CpuVectorT<real>;
-template class CpuVectorT<int>;
-template class GpuVectorT<real>;
-template class GpuVectorT<int>;
-template class CpuGpuVectorT<real>;
-template class CpuGpuVectorT<int>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.h b/paddle/legacy/math/Vector.h
deleted file mode 100644
index 63cb4651c52..00000000000
--- a/paddle/legacy/math/Vector.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <memory>
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Thread.h"
-
-namespace paddle {
-
-template <class T>
-class GpuVectorT;
-template <class T>
-class CpuVectorT;
-
-template <class T>
-class BaseVector;
-
-class SyncThreadPool;
-
-class Matrix;
-
-template <class T>
-class BaseVector : public BaseMatrixT<T> {
- public:
-  BaseVector(size_t size, T* data, bool useGpu)
-      : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
-
-  ~BaseVector() {}
-
- protected:
-  size_t& size_;
-};
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-template <class T>
-class VectorT : public BaseVector<T> {
- protected:
-  VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
-      : BaseVector<T>(size,
-                      reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
-                      useGpu) {
-    memoryHandle_ = memoryHandle;
-  }
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  VectorT(size_t size, T* data, bool useGpu)
-      : BaseVector<T>(size, data, useGpu) {}
-
- public:
-  virtual ~VectorT() {}
-
-  static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(T* data, size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(size_t size,
-                                            MemoryHandlePtr memoryHandle,
-                                            size_t offset = 0);
-
-  // owner can set SyncThreadPool,
-  // if not set, will use globalSyncThreadPool,
-  // which can be used in main thread only.
-  static std::shared_ptr<VectorT<T>> createParallelVector(
-      size_t size, bool useGpu, SyncThreadPool* pool = nullptr);
-
-  size_t getSize() const { return this->size_; }
-  const T* getData() const { return this->data_; }
-  T* getData() { return this->data_; }
-
-  virtual void zeroMem() = 0;
-  // set all elements to value
-  virtual void reset(const T& value) = 0;
-  // fill data by 0, 1, 2, ...
-  virtual void fillSequence() = 0;
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  /**
-   * resizing to a big vector will not preserve old values.
-   */
-  void resize(size_t newSize) {
-    if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) {
-      memoryHandle_ = newMemory(newSize * sizeof(T));
-      this->data_ = reinterpret_cast<T*>(memoryHandle_->getBuf());
-    }
-    this->size_ = newSize;
-  }
-
-  static void resizeOrCreate(std::shared_ptr<VectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu) {
-    if (vec) {
-      vec->resize(size);
-    } else {
-      vec = create(size, useGpu);
-    }
-  }
-
-  virtual MemoryHandlePtr newMemory(size_t size) = 0;
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const VectorT<T>& src, size_t start, size_t size) {
-    CHECK_EQ(BaseVector<T>::useGpu_, src.useGpu_);
-    CHECK_LT(start, src.size_);
-    CHECK_LE(start + size, src.size_);
-
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src.data_) + start;
-  }
-
-  std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size) {
-    CHECK_LE(start + size, static_cast<size_t>(getSize()));
-    return VectorT<T>::create(getData() + start, size, BaseVector<T>::useGpu_);
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const T* src, size_t start, size_t size) {
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src) + start;
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   * in *interval* [interval.first, interval.second)
-   */
-  void subVecFrom(const VectorT<T>& src, std::pair<size_t, size_t> interval) {
-    subVecFrom(src, interval.first, interval.second - interval.first);
-  }
-
-  /**
-   * convert the vector to a sparse one_hot matrix of width idRange
-   * only applies to IVector
-   */
-  std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
-
-  /**
-   * @brief cast vector of "real" elements to "int" elements.
-   *
-   * @note: float -> int must be casted, or you'll get wrong data.
-   */
-  std::shared_ptr<VectorT<int>> castToInt();
-
-  /**
-   * This function will crash if the size of src and dest is different.
-   */
-  virtual void copyFrom(const VectorT<T>& src) = 0;
-
-  /**
-   * If GpuVector, this function is an asynchronous interface,
-   * will push the copy-task to the specifed-stream and return immediately.
-   *
-   * If CpuVector, this function is an synchronous interface,
-   * same as the copyFrom(const VectorT<T>& src).
-   */
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory
-   */
-  virtual void copyFrom(const T* src, size_t size) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory,
-   */
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0;
-
-  /**
-   * exec a func in single/multi thread
-   */
-  virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); }
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) = 0;
-
-  /// Get the value for the i'th element
-  virtual T getElement(size_t i) const = 0;
-  virtual void setElement(size_t i, const T& value) = 0;
-
-  //----------  math operations ----------------
-
-  // sum of the absolute value of each elements
-  virtual T getAbsSum() = 0;
-
-  virtual T getSum() = 0;
-  virtual T getMax() = 0;
-  virtual T getAbsMax() = 0;
-  virtual T getMin() = 0;
-
-  /// element-wise calc:  this = (b == value)
-  virtual void isEqualTo(const VectorT<T>& b, const T& value) = 0;
-
-  /// select elements indexed by *ids* from vector *src*
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) = 0;
-
-  enum HistogramType {
-    HISTOGRAM_EXPONENT = 0,
-  };
-
-  /**
-   * @brief  print histogram of vector values
-   *
-   * @note   only exponent histogram supported currently
-   */
-  virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0;
-
-  /// generate uniform random value for each element
-  virtual void rand() = 0;
-  /**
-   * generate uniform random value for each element,
-   * data range is from 0 to (classes - 1).
-   */
-  virtual void rand(size_t classes) = 0;
-
-  /**
-   * Debug use only. Very inefficient for GPU vector.
-   * get the value at pos.
-   */
-  virtual T get(size_t pos) = 0;
-
-  /**
-   * generate univariate Gaussian distributed random numbers
-   * with given mean and standardDeviation.
-   */
-  virtual void randnorm(real mean, real standardDeviation) = 0;
-
-  /**
-   * generate uniform distributed random numbers
-   * with given range.
-   */
-  virtual void uniform(real left, real right) = 0;
-
-  /// print the first "num" elements of the Vector
-  virtual void print(std::ostream& os, size_t num) const = 0;
-
-  /// print the "idx" element of the Vector
-  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (BaseVector<T>::useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
- protected:
-  friend class GpuVectorT<T>;
-  friend class CpuVectorT<T>;
-  virtual void copyTo(CpuVectorT<T>* dest) const = 0;
-  virtual void copyTo(GpuVectorT<T>* dest) const = 0;
-  MemoryHandlePtr memoryHandle_;
-};
-
-template <class T>
-std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
-  vec.print(os, vec.getSize());
-  return os;
-}
-
-template <class T>
-class GpuVectorT : public VectorT<T> {
- public:
-  explicit GpuVectorT(size_t size);
-  GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
-      : VectorT<T>(size, memHandle, offset, true) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  GpuVectorT(size_t size, T* data) : VectorT<T>(size, data, true) {}
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<GpuMemoryHandle>(size);
-  }
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual T getElement(size_t i) const;
-  virtual void setElement(size_t i, const T& value);
-  virtual T* getPoint(const uint64_t beginPos);
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<T>(*this, expr);
-  }
-
- protected:
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-};
-
-template <class T>
-class CpuVectorT : public VectorT<T> {
- public:
-  explicit CpuVectorT(size_t size);
-  CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
-      : VectorT<T>(size, memoryHandle, offset, false) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  CpuVectorT(size_t size, T* data) : VectorT<T>(size, data, false) {}
-
-  /**
-   * If src is a CpuVector, the new CpuVector will share the data with src
-   *
-   * If src is a GpuVector, the new CpuVector will copy data from src
-   */
-  explicit CpuVectorT(const VectorT<T>& src);
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<CpuMemoryHandle>(size);
-  }
-
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) {
-    return this->getData() + beginPos;
-  }
-
-  virtual T getElement(size_t i) const { return this->getData()[i]; }
-  virtual void setElement(size_t i, const T& value) {
-    this->getData()[i] = value;
-  }
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<T>(*this, expr);
-  }
-};
-
-template <class T>
-class ParallelCpuVectorT : public CpuVectorT<T> {
- public:
-  ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
-      : CpuVectorT<T>(size), pool_(pool) {}
-
-  virtual void zeroMem() {
-    parallelExec([](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::zeroMem(); });
-  }
-  virtual void randnorm(real mean, real standardDeviation) {
-    parallelExec([=](CpuVectorT<T>& vec) {
-      vec.CpuVectorT<T>::randnorm(mean, standardDeviation);
-    });
-  }
-  virtual void uniform(real left, real right) {
-    parallelExec(
-        [=](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::uniform(left, right); });
-  }
-
-  virtual void exec(SyncThreadPool::JobFunc jobFunc);
-
- private:
-  typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
-  void parallelExec(ExecFunc func);
-  SyncThreadPool* pool_;
-};
-
-/**
- * A class to do conversion between CpuVector and GpuVector automatically.
- */
-template <class T>
-class CpuGpuVectorT {
- public:
-  /**
-   * @brief An enum type of SyncedFlag using to
-   *        mark data memory is in CPU or GPU.
-   *
-   * DATA_AT_CPU: data is located in CPU.
-   *
-   * DATA_AT_GPU: data is located in GPU.
-   *
-   * SYNCED: data is located in CPU and GPU simultaneously.
-   */
-  enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 };
-
-  /**
-   * @brief A constructor, create cpuVectorT_ or gpuVectorT_.
-   *
-   * @param[in] size    data size.
-   * @param[in] useGpu  use gpu or not.
-   */
-  explicit CpuGpuVectorT(size_t size, bool useGpu);
-
-  /**
-   * @brief A constructor, create CpuGpuVectorT by VectorT.
-   *
-   * If src is CpuVector, cpuVectorT_ is shared data with src.
-   *
-   * If src is GpuVector, gpuVectorT_ is shared data with src.
-   */
-  explicit CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src);
-
-  /**
-   * @brief A constructor.
-   *
-   * If useGpu is true, data should be located in device and
-   * create gpuVectorT_ with data.
-   *
-   * If useGpu is false, data should be located in host and
-   * create cpuVectorT_ with data.
-   *
-   * @note Data is owned by the caller and should be valid during
-   *       the life of this vector.
-   *       Caller is responsible for release the memory.
-   */
-  CpuGpuVectorT(size_t size, T* data, bool useGpu);
-
-  CpuGpuVectorT(CpuGpuVectorT<T>& src, size_t offset, size_t size);
-
-  virtual ~CpuGpuVectorT() {}
-
-  static std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu);
-
-  /**
-   * @brief resize vector.
-   *
-   * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,
-   *
-   * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.
-   */
-  void resize(size_t size, bool useGpu);
-
-  /**
-   * @brief resize or create CpuGpuVectorT.
-   */
-  static void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu);
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * If useGpu is true, return gpuVectorT_.
-   *
-   * If useGpu is false, return cpuVectorT_.
-   *
-   * @note Caller should not change the data.
-   *       If caller changes const attribute,
-   *       should set syncFlag_.
-   */
-  std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const;
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * @note: This interface will change syncFlag_, so if you will
-   *        not change the data, you should call getVector.
-   */
-  std::shared_ptr<VectorT<T>>& getMutableVector(bool useGpu);
-
-  /**
-   * @brief return const T* data.
-   *
-   * If useGpu is true, return device data.
-   *
-   * If useGpu is false, return host data.
-   */
-  const T* getData(bool useGpu) const;
-
-  // TODO(yuyang18): Make getData more c++ style.
-  //  inline T* getData(bool useGpu) {
-  //    return getMutableData(useGpu);
-  //  }
-
-  T* getMutableData(bool useGpu);
-
-  /**
-   * If useGpu is true, gpuVectorT_->Op().
-   *
-   * If useGpu is false, cpuVectorT_->Op().
-   *
-   * Op is zeroMem, fillSequence, ...
-   */
-  void zeroMem(bool useGpu);
-  void fillSequence(bool useGpu);
-  void setElement(size_t i, const T& value, bool useGpu);
-
-  /**
-   * @brief return i-th element.
-   */
-  T getElement(size_t i) const;
-
-  /**
-   * @brief return vector size.
-   */
-  size_t getSize() const {
-    size_t size = 0;
-    switch (*sync_) {
-      case SYNCED:
-      case DATA_AT_CPU:
-        size = cpuVectorT_->getSize();
-        break;
-      case DATA_AT_GPU:
-        size = gpuVectorT_->getSize();
-        break;
-      default:
-        LOG(FATAL) << "Not support";
-        break;
-    }
-    return size;
-  }
-
-  /// copy data to cpuVectorT_.
-  inline void copyToCpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_CPU);
-  }
-  /// copy data to cpuVectorT_ using specifed-stream.
-  inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_CPU);
-  }
-
-  /// copy data to gpuVectorT_.
-  inline void copyToGpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_GPU);
-  }
-  /// copy data to gpuVectorT_ using specifed-stream.
-  inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_GPU);
-  }
-
-  /**
-   * @brief copy from src using specifed-stream.
-   *
-   * If src is CpuVectorT, copy to cpuVectorT_.
-   *
-   * If src is GpuVectorT, copy to gpuVectorT_.
-   */
-  void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief copy data.
-   *
-   * If useGpu is false, copy host data to cpuVectorT_.
-   *
-   * If useGpu is true, copy device data to gpuVectorT_.
-   *
-   * @note  data address should consistent with useGpu.
-   */
-  void copyFrom(const T* data, size_t size, bool useGpu);
-  void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu);
-
-  /**
-   * @brief copy from (src + offset) using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src,
-                size_t offset,
-                size_t size,
-                bool useGpu,
-                hl_stream_t stream);
-
-  /**
-   * @brief copy from src using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief return sync_.
-   */
-  inline SyncedFlag* getSync() const { return sync_; }
-
-  /**
-   * @brief set sync_.
-   */
-  inline void setSync(SyncedFlag* sync) { sync_ = sync; }
-
-  inline void setSync(SyncedFlag syncFlag) {
-    if (sync_) {
-      *sync_ = syncFlag;
-    } else {
-      syncFlag_ = syncFlag;
-      sync_ = &syncFlag_;
-    }
-  }
-
-  inline void setSync(bool useGpu) {
-    SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;
-    setSync(flag);
-  }
-
- protected:
-  void resizeOrCreate(size_t size, bool useGpu);
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToCpu();
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToGpu();
-
-  /// host pointer.
-  std::shared_ptr<VectorT<T>> cpuVectorT_;
-  /// device pointer.
-  std::shared_ptr<VectorT<T>> gpuVectorT_;
-  /// specify current data address.
-  SyncedFlag syncFlag_;
-  SyncedFlag* sync_;
-};
-
-typedef VectorT<real> Vector;
-typedef CpuVectorT<real> CpuVector;
-typedef GpuVectorT<real> GpuVector;
-
-typedef VectorT<int> IVector;
-typedef CpuVectorT<int> CpuIVector;
-typedef GpuVectorT<int> GpuIVector;
-
-typedef std::shared_ptr<Vector> VectorPtr;
-typedef std::shared_ptr<CpuVector> CpuVectorPtr;
-typedef std::shared_ptr<GpuVector> GpuVectorPtr;
-
-typedef std::shared_ptr<IVector> IVectorPtr;
-typedef std::shared_ptr<CpuIVector> CpuIVectorPtr;
-typedef std::shared_ptr<GpuIVector> GpuIVectorPtr;
-
-typedef CpuGpuVectorT<real> CpuGpuVector;
-typedef CpuGpuVectorT<int> ICpuGpuVector;
-typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr;
-typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/tests/CMakeLists.txt b/paddle/legacy/math/tests/CMakeLists.txt
deleted file mode 100644
index d8b7f9e3fc7..00000000000
--- a/paddle/legacy/math/tests/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-# unittest for common package
-
-add_simple_unittest(test_ExecViaCpu)
-add_simple_unittest(test_SIMDFunctions)
-add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_RowBuffer)
-if(NOT MOBILE_INFERENCE)
-    add_simple_unittest(test_SparseMatrix)
-endif()
-
-# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
-add_unittest(test_matrixCompare
-    test_matrixCompare.cpp)
-
-add_simple_unittest(test_sparseMatrixCompare)
-add_simple_unittest(test_perturbation)
-add_simple_unittest(test_CpuGpuVector)
-add_simple_unittest(test_Allocator)
-
-if(WITH_GPU)
-    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
-    link_paddle_test(test_Tensor)
-    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-    link_paddle_test(test_lazyAssign)
-else()
-    compile_cu_as_cpp(test_Tensor.cu)
-    add_unittest(test_Tensor test_Tensor.cu)
-    compile_cu_as_cpp(test_lazyAssign.cu)
-    add_unittest(test_lazyAssign test_lazyAssign.cu)
-endif(WITH_GPU)
-
-add_simple_unittest(test_FPException)
-add_simple_unittest(test_GpuProfiler)
-add_simple_unittest(test_BaseMatrix)
-add_simple_unittest(test_Matrix)
diff --git a/paddle/legacy/math/tests/OriginalOptimizerApi.h b/paddle/legacy/math/tests/OriginalOptimizerApi.h
deleted file mode 100644
index f386e19958a..00000000000
--- a/paddle/legacy/math/tests/OriginalOptimizerApi.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-using namespace paddle;  // NOLINT
-
-void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
-                                      real alpha,
-                                      real beta,
-                                      real gamma,
-                                      real tau,
-                                      real learningRate) {
-  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                   -alpha * gamma * learningRate);
-  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                   tau * alpha * gamma * learningRate);
-  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                             tau / beta + 1.0 / alpha,
-                             *vecs[PARAMETER_MOMENTUM_VT],
-                             1.0 / beta);
-}
-
-void AdagradParameterOptimizer(const VectorPtr vecs[],
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate) {
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
-                                                1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
-                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
-
-  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
-  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
-                                        epsilon,
-                                        epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
-      *vecs[PARAMETER_GRADIENT],
-      *vecs[PARAMETER_LEARNING_RATE],
-      rou,
-      1.0f - rou);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void RMSPropParameterOptimizer(const VectorPtr vecs[],
-                               real accumulatedRou,
-                               real rou,
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate,
-                               bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                           -1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
-                                      real accumulatedRou,
-                                      real rou,
-                                      real epsilon,
-                                      real learningRate,
-                                      real momentum,
-                                      real decayRate,
-                                      bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdamParameterOptimizer(const VectorPtr vecs[],
-                            real beta1,
-                            real beta2,
-                            real beta1_power,
-                            real beta2_power,
-                            real epsilon,
-                            real learningRate) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  g->square2();
-  v->add(*g, beta2, 1 - beta2);
-
-  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
-  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
-  g->sqrt2(*v);
-  g->dotDiv(*m, *g, 0., epsilon);
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-  theta->add(*theta, 1.0, *g, -alpha);
-}
-
-void AdamaxParameterOptimizer(
-    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u->mulScalar(beta2);
-  g->abs2();
-  u->max2(*u, *g);
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  g->dotDiv(*m, *u);
-  real learningRate = alpha / (1 - std::pow(beta1, step));
-  theta->add(*theta, 1.0, *g, -learningRate);
-}
diff --git a/paddle/legacy/math/tests/PerfUtils.h b/paddle/legacy/math/tests/PerfUtils.h
deleted file mode 100644
index eaf4869e4c9..00000000000
--- a/paddle/legacy/math/tests/PerfUtils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// Performance Check
-#ifdef PADDLE_DISABLE_TIMER
-
-#define EXPRESSION_PERFORMANCE(expression) expression;
-
-#else
-
-#include "paddle/legacy/utils/Stat.h"
-using namespace paddle;  // NOLINT
-
-#define EXPRESSION_PERFORMANCE(expression)                             \
-  do {                                                                 \
-    char expr[30];                                                     \
-    strncpy(expr, #expression, 30);                                    \
-    if (expr[29] != '\0') {                                            \
-      expr[27] = '.';                                                  \
-      expr[28] = '.';                                                  \
-      expr[29] = '\0';                                                 \
-    }                                                                  \
-    expression;                                                        \
-    for (int i = 0; i < 20; i++) {                                     \
-      REGISTER_TIMER(expr);                                            \
-      expression;                                                      \
-    }                                                                  \
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
-              << *globalStat.getStat(expr);                            \
-    globalStat.reset();                                                \
-  } while (0)
-
-#endif
diff --git a/paddle/legacy/math/tests/TensorCheck.h b/paddle/legacy/math/tests/TensorCheck.h
deleted file mode 100644
index 41c8ece282e..00000000000
--- a/paddle/legacy/math/tests/TensorCheck.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a TensorCheck template function, which can be used to
- * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
- */
-
-#include <cmath>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace autotest {
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::VectorT;
-using paddle::CpuVectorT;
-using paddle::GpuVectorT;
-
-class AssertEqual {
- public:
-  AssertEqual(real err = 0) : err_(err) {}
-
-  inline bool operator()(real a, real b) {
-    if (err_ == 0) {
-      if (a != b) {
-        return false;
-      }
-    } else {
-      if (std::fabs(a - b) > err_) {
-        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
-          return false;
-        }
-      }
-    }
-
-    return true;
-  }
-
- private:
-  real err_;
-};
-
-template <typename Tensor>
-class CopyToCpu;
-
-template <>
-class CopyToCpu<CpuMatrix> {
- public:
-  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
-  const CpuMatrix& copiedArg() const { return arg_; }
-
- private:
-  const CpuMatrix& arg_;
-};
-
-template <>
-class CopyToCpu<GpuMatrix> {
- public:
-  explicit CopyToCpu(const GpuMatrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
- private:
-  CpuMatrix arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
- public:
-  explicit CopyToCpu(const Matrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
- private:
-  CpuMatrix arg_;
-};
-
-template <typename T>
-class CopyToCpu<CpuVectorT<T>> {
- public:
-  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
-  const CpuVectorT<T>& copiedArg() const { return arg_; }
-
- private:
-  const CpuVectorT<T>& arg_;
-};
-
-template <typename T>
-class CopyToCpu<GpuVectorT<T>> {
- public:
-  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
- private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename T>
-class CopyToCpu<VectorT<T>> {
- public:
-  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
- private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare,
-                 const CpuMatrix& matrix1,
-                 const CpuMatrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (!compare(a, b)) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-template <typename AssertEq, class T>
-void TensorCheck(AssertEq compare,
-                 const CpuVectorT<T>& vector1,
-                 const CpuVectorT<T>& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const T* data1 = vector1.getData();
-  const T* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (!compare(a, b)) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
-}
-
-template <typename AssertEq, typename Tensor1, typename Tensor2>
-void TensorCheck(AssertEq compare,
-                 const Tensor1& tensor1,
-                 const Tensor2& tensor2) {
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, real args1, real args2) {
-  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
-                                         << ", args2 = " << args2;
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
-  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
-                          << ", args2 = " << args2;
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
-  AssertEqual compare(0);
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
-#ifndef PADDLE_TYPE_DOUBLE
-  AssertEqual compare(1e-3);
-#else
-  AssertEqual compare(1e-10);
-#endif
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-}  // namespace autotest
diff --git a/paddle/legacy/math/tests/TestUtils.h b/paddle/legacy/math/tests/TestUtils.h
deleted file mode 100644
index 60e76359da6..00000000000
--- a/paddle/legacy/math/tests/TestUtils.h
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a AutoCompare calss to simplify the comparison
- * of CPU and GPU member functions.
- *
- * This takes two steps
- * 1. Construct an AutoCompare object.
- *    When constructing an AutoCompare object, you can set the err argument
- * to specify the maximum error for CPU and GPU functions.
- *
- * 2. Use the template functions cmpWithArg or cmpWithoutArg.
- * A. [cmpWithArg] Requires the caller construct the cpu arguments.
- *
- *  AutoCompare test;
- *  Init Argument arg1,arg2...
- *  test.cmpWithArg(function, arg1, arg2....)
- *
- * B. [cmpWithoutArg] The caller do not need construct arguments.
- *    If matrix used in these functions arguments is the same size.
- *    Such as the element wise function and the aggregate function
- *    defined in the BaseMatrix.cpp.
- *
- *  AutoCompare test;
- *  test.cmpWithoutArg<I...>(function, height, width)
- */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace autotest {
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using paddle::CpuSparseMatrix;
-using paddle::GpuSparseMatrix;
-
-template <typename T1, typename T2>
-class ReplaceType {
- public:
-  typedef T1 type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, CpuMatrix> {
- public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, GpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, CpuMatrix> {
- public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, GpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-// construct a argument
-template <typename T>
-T construct(int height, int width);
-
-template <>
-float construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-double construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-size_t construct(int height, int width) {
-  size_t offset = std::rand() % (height < width ? height : width);
-  return offset;
-}
-
-template <>
-CpuMatrix construct(int height, int width) {
-  CpuMatrix a(height, width);
-  return a;
-}
-
-template <>
-GpuMatrix construct(int height, int width) {
-  GpuMatrix a(height, width);
-  return a;
-}
-
-// init a argument
-template <typename T>
-void init(T& v) {
-  return;
-}
-
-template <>
-void init(CpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-template <>
-void init(GpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-// init a tuple which contains a set of arguments.
-template <std::size_t I = 0, typename... Args>
-inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
-    std::tuple<Args...>& t) {}
-
-template <std::size_t I = 0, typename... Args>
-    inline typename std::enable_if <
-    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
-  init(std::get<I>(t));
-  initTuple<I + 1>(t);
-}
-
-// copy a argument, copy src to dest
-template <typename T1, typename T2>
-void copy(T1& dest, T2& src) {
-  dest = src;
-}
-
-template <>
-void copy(GpuMatrix& dest, CpuMatrix& src) {
-  dest.copyFrom(src);
-}
-
-// copy a tuple, copy src to dest
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
-    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
-
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-    inline typename std::enable_if <
-    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
-                                              std::tuple<Args2...>& src) {
-  copy(std::get<I>(dest), std::get<I>(src));
-  copyTuple<I + 1>(dest, src);
-}
-
-// call member function
-template <typename C,
-          typename FC,
-          typename R,
-          typename... FArgs,
-          typename... Args>
-R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
-  return (obj.*f)(args...);
-}
-
-template <typename T>
-class ReturnType {
- public:
-  typedef T type;
-};
-
-template <>
-class ReturnType<CpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReturnType<CpuIVector> {
- public:
-  typedef GpuIVector type;
-};
-
-template <>
-class ReturnType<CpuSparseMatrix> {
- public:
-  typedef GpuSparseMatrix type;
-};
-
-template <typename T>
-typename ReturnType<T>::type autoArgs(T& v) {
-  return v;
-}
-
-template <>
-GpuMatrix autoArgs(CpuMatrix& v) {
-  GpuMatrix a(v.getHeight(), v.getWidth());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuIVector autoArgs(CpuIVector& v) {
-  GpuIVector a(v.getSize());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
-  GpuSparseMatrix a(v.getHeight(),
-                    v.getWidth(),
-                    v.getElementCnt(),
-                    v.getValueType(),
-                    v.getFormat());
-  a.copyFrom(v, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return a;
-}
-
-class AutoCompare {
- public:
-  /**
-   * err is the allowed calculation error.
-   * The smaller the value of err,
-   * the stricter the comparison is between CPU and GPU calculations.
-   */
-  AutoCompare(size_t height, size_t width, real err = 1e-3)
-      : cpu(height, width), gpu(height, width), compare(err) {
-    init(cpu);
-    copy(gpu, cpu);
-  }
-
-  template <typename C, typename R, typename... FArgs, typename... Args>
-  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
-    static_assert(sizeof...(FArgs) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    call(cpu, f, args...);
-    call(gpu, f, autoArgs(args)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
-  template <std::size_t... I, typename C, typename R, typename... Args>
-  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
-    static_assert(sizeof...(I) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    (void)height;
-    (void)width;
-    auto tuple1 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            CpuMatrix>::type>(height, width)...);
-
-    auto tuple2 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            GpuMatrix>::type>(height, width)...);
-
-    initTuple(tuple1);
-    copyTuple(tuple2, tuple1);
-
-    call(cpu, f, std::get<I>(tuple1)...);
-    call(gpu, f, std::get<I>(tuple2)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
- protected:
-  CpuMatrix cpu;
-  GpuMatrix gpu;
-  AssertEqual compare;
-};
-
-}  // namespace autotest
diff --git a/paddle/legacy/math/tests/test_Allocator.cpp b/paddle/legacy/math/tests/test_Allocator.cpp
deleted file mode 100644
index 122be9082a8..00000000000
--- a/paddle/legacy/math/tests/test_Allocator.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-#define private public
-#include "paddle/legacy/math/Allocator.h"
-#include "paddle/legacy/math/MemoryHandle.h"
-#include "paddle/legacy/math/PoolAllocator.h"
-
-using namespace paddle;  // NOLINT
-
-template <typename Allocator>
-void testPoolAllocator() {
-  PoolAllocator* pool =
-      new PoolAllocator(new Allocator(), /* sizeLimit */ 1024);
-
-  /* alloc from system memory */
-  void* ptr1 = pool->alloc(10);
-  void* ptr2 = pool->alloc(200);
-  void* ptr3 = pool->alloc(200);
-  pool->free(ptr1, 10);
-  pool->free(ptr2, 200);
-  pool->free(ptr3, 200);
-  pool->printAll();
-  EXPECT_EQ((size_t)2, pool->pool_.size());
-  EXPECT_EQ((size_t)1, pool->pool_[10].size());
-  EXPECT_EQ((size_t)2, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, pool->pool_[10][0]);
-  EXPECT_EQ(ptr2, pool->pool_[200][0]);
-  EXPECT_EQ(ptr3, pool->pool_[200][1]);
-
-  /* alloc from pool */
-  void* ptr4 = pool->alloc(10);
-  void* ptr5 = pool->alloc(200);
-  pool->printAll();
-  EXPECT_EQ((size_t)0, pool->pool_[10].size());
-  EXPECT_EQ((size_t)1, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, ptr4);
-  EXPECT_EQ(ptr3, ptr5);
-  pool->free(ptr4, 10);
-  pool->free(ptr5, 200);
-
-  /* alloc size > sizeLimit */
-  void* ptr6 = pool->alloc(1024);
-  pool->free(ptr6, 1024);
-  EXPECT_LE((size_t)1024, pool->poolMemorySize_);
-
-  void* ptr7 = pool->alloc(1);
-  EXPECT_EQ((size_t)0, pool->poolMemorySize_);
-  EXPECT_EQ((size_t)0, pool->pool_.size());
-  pool->free(ptr7, 1);
-
-  delete pool;
-}
-
-TEST(Allocator, Pool) {
-  testPoolAllocator<CpuAllocator>();
-#ifdef PADDLE_WITH_CUDA
-  testPoolAllocator<GpuAllocator>();
-#endif
-}
-
-TEST(MemoryHandle, Cpu) {
-  for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) {
-    CpuMemoryHandle handle(size);
-    EXPECT_LE(handle.getSize(), handle.getAllocSize());
-  }
-
-  void* ptr1;
-  void* ptr2;
-  {
-    CpuMemoryHandle handle(256);
-    ptr1 = handle.getBuf();
-  }
-  {
-    CpuMemoryHandle handle(256);
-    ptr2 = handle.getBuf();
-  }
-  EXPECT_EQ(ptr1, ptr2);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MemoryHandle, Gpu) {
-  int numGpu = hl_get_device_count();
-
-  /* alloc from system memory */
-  void* ptr3[numGpu];
-  void* ptr4[numGpu];
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle2(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    ptr3[i] = handle3.getBuf();
-    ptr4[i] = handle4.getBuf();
-  }
-
-  /* alloc from pool */
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    EXPECT_EQ(ptr3[i], handle3.getBuf());
-    EXPECT_EQ(ptr4[i], handle4.getBuf());
-  }
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_BaseMatrix.cpp b/paddle/legacy/math/tests/test_BaseMatrix.cpp
deleted file mode 100644
index 488765c6ac2..00000000000
--- a/paddle/legacy/math/tests/test_BaseMatrix.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
- * implementation of CPU and GPU member function in
- * BaseMatrix.cpp and Matrix.cpp.
- */
-
-#include <gtest/gtest.h>
-#include "TestUtils.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-
-using paddle::BaseMatrix;
-using paddle::Matrix;
-using autotest::AutoCompare;
-
-// Test all void (BaseMatrix::*)() function
-TEST(BaseMatrix, void) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)()) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg(f, height, width);
-      };
-
-      compare(&BaseMatrix::neg);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::zero);
-      compare(&BaseMatrix::one);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real) function
-TEST(BaseMatrix, real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::biggerThanScalar);
-      compare(&BaseMatrix::downClip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::relu);
-      compare(&BaseMatrix::reluDerivative);
-      compare(&BaseMatrix::softrelu);
-      compare(&BaseMatrix::softreluDerivative);
-      compare(&BaseMatrix::brelu);
-      compare(&BaseMatrix::breluDerivative);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::squareDerivative);
-      compare(&BaseMatrix::tanh);
-      compare(&BaseMatrix::tanhDerivative);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::reciprocalDerivative);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::absDerivative);
-      compare(&BaseMatrix::sigmoid);
-      compare(&BaseMatrix::sigmoidDerivative);
-      compare(&BaseMatrix::expDerivative);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareMul);
-      compare(&BaseMatrix::addColVector);
-      compare(&BaseMatrix::addRowVector);
-      compare(&BaseMatrix::mulRowVector);
-      compare(&BaseMatrix::divRowVector);
-      compare(&BaseMatrix::mulColVector);
-      compare(&BaseMatrix::divColVector);
-      compare(&BaseMatrix::addP2P);
-      compare(&BaseMatrix::invSqrt);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real, real) function
-TEST(BaseMatrix, real_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::clip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
-TEST(BaseMatrix, BaseMatrix_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::addBias);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::addScalar);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::scalarDiv);
-      compare(&BaseMatrix::addSquare);
-      compare(&BaseMatrix::isEqualTo);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height,
-                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::softCrossEntropy);
-      compare(&BaseMatrix::softCrossEntropyBp);
-      compare(&BaseMatrix::binaryLabelCrossEntropy);
-      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::add2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotDiv);
-      compare(&BaseMatrix::logisticRegressionLoss);
-      compare(&BaseMatrix::logisticRegressionLossBp);
-      compare(&BaseMatrix::biggerThan);
-      compare(&BaseMatrix::max2);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareSquare);
-    }
-  }
-}
-
-void TestEelementWise(size_t height, size_t width) {
-  AutoCompare rowScale(height, width);
-  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
-
-  AutoCompare rowDotMul(height, width);
-  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
-
-  AutoCompare binaryClassificationError(height, width);
-  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
-      &BaseMatrix::binaryClassificationError, height, width);
-
-  AutoCompare sumOfSquaresBp(height, width);
-  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
-}
-
-void TestAggregateToRow(size_t height, size_t width) {
-  AutoCompare maxCols(1, width);
-  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
-
-  AutoCompare minCols(1, width);
-  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
-
-  AutoCompare addDotMulVMM(1, width);
-  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
-
-  AutoCompare sumCols(1, width);
-  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
-
-  AutoCompare collectBias(1, width);
-  collectBias.cmpWithoutArg<0, 1>(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
-      height,
-      width);
-}
-
-void TestAggregateToCol(size_t height, size_t width) {
-  AutoCompare maxRows(height, 1);
-  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
-
-  AutoCompare minRows(height, 1);
-  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
-
-  AutoCompare sumRows(height, 1);
-  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
-
-  AutoCompare sumOfSquares(height, 1);
-  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
-}
-
-TEST(BaseMatrix, Other) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      TestEelementWise(height, width);
-      TestAggregateToRow(height, width);
-      TestAggregateToCol(height, width);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_CpuGpuVector.cpp b/paddle/legacy/math/tests/test_CpuGpuVector.cpp
deleted file mode 100644
index 010fef534d1..00000000000
--- a/paddle/legacy/math/tests/test_CpuGpuVector.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(CpuGpuVector, getData) {
-  size_t size = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuVectorPtr cpuVec = std::make_shared<CpuVector>(size);
-  GpuVectorPtr gpuVec = std::make_shared<GpuVector>(size);
-  cpuVec->uniform(0.0, 10.0);
-  gpuVec->copyFrom(*cpuVec, stream);
-  hl_stream_synchronize(stream);
-
-  CpuGpuVectorPtr vec = std::make_shared<CpuGpuVector>(gpuVec);
-  auto a = vec->getData(false);
-  auto b = cpuVec->getData();
-  hl_stream_synchronize(stream);
-  checkDataEqual(a, b, size);
-}
-
-TEST(CpuGpuVector, subCreate) {
-  size_t size1 = 1024;
-  size_t offset = 100;
-  size_t size2 = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuGpuVectorPtr v1 = std::make_shared<CpuGpuVector>(size1, /*useGpu*/ false);
-  auto vec = v1->getMutableVector(false);
-  vec->uniform(0.0, 10.0);
-  auto v2 = std::make_shared<CpuGpuVector>(*v1, offset, size2);
-  CHECK_EQ(*v1->getSync(), *v2->getSync());
-
-  // check subVec equal
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  CpuVectorPtr v1Check = std::make_shared<CpuVector>(size1);
-  CpuVectorPtr v2Check = std::make_shared<CpuVector>(size2);
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  checkDataEqual(v2->getData(false), v2Check->getData(), size2);
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-
-  CpuVectorPtr noise = std::make_shared<CpuVector>(size2);
-  noise->uniform(0.0, 1.0);
-  auto v = v2->getMutableVector(false);  // will change header
-  // add noise to subVec
-  v->add(*noise);
-
-  // check v1_cpu_data == v2_cpu_data
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  // check v1_gpu_data == v2_gpu_data
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_ExecViaCpu.cpp b/paddle/legacy/math/tests/test_ExecViaCpu.cpp
deleted file mode 100644
index b2ce0bc7ede..00000000000
--- a/paddle/legacy/math/tests/test_ExecViaCpu.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Util.h>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-
-const int height = 10;
-const int width = 16;
-
-real f(Matrix& mat1,
-       const Matrix& mat2,
-       IVector& vec1,
-       const IVector& vec2,
-       real scalar) {
-  CHECK(!mat1.useGpu());
-  CHECK(!mat2.useGpu());
-  CHECK(!vec1.useGpu());
-  CHECK(!vec2.useGpu());
-  mat1.copyFrom(mat2);
-  vec1.copyFrom(vec2);
-
-  return scalar;
-}
-
-class Functor {
- public:
-  real operator()(Matrix& mat1,
-                  const Matrix& mat2,
-                  IVector& vec1,
-                  const IVector& vec2,
-                  real scalar) {
-    a_ = f(mat1, mat2, vec1, vec2, scalar);
-    return a_;
-  }
-
- private:
-  real a_;
-};
-
-template <typename F>
-void testWrapper(F&& f) {
-  MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false);
-  MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false);
-
-  IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false);
-  IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false);
-
-  const real scalar = 1.23456;
-
-  MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true);
-  MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true);
-  IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true);
-  IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true);
-
-  cpumat2->randomizeUniform();
-  cpuvec2->rand(width);
-  gpumat2->copyFrom(*cpumat2);
-  gpuvec2->copyFrom(*cpuvec2);
-
-  real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456);
-  EXPECT_EQ(ret, scalar);
-  cpumat1->copyFrom(*gpumat1);
-  cpuvec1->copyFrom(*gpuvec1);
-
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i));
-    for (int j = 0; j < width; ++j) {
-      EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j));
-    }
-  }
-  gpumat1->resize(height, 1);
-  execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1);
-
-  cpumat1->resize(height, 1);
-  cpumat1->selectElements(*cpumat2, *cpuvec1);
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(ExecViaCpu, test1) {
-  testWrapper(f);
-  testWrapper(&f);
-
-  auto lambda = [](Matrix& mat1,
-                   const Matrix& mat2,
-                   IVector& vec1,
-                   const IVector& vec2,
-                   real scalar) -> real {
-    return f(mat1, mat2, vec1, vec2, scalar);
-  };
-  LOG(INFO) << "lambda is_class=" << std::is_class<decltype(lambda)>::value
-            << " is_function=" << std::is_function<decltype(lambda)>::value;
-  testWrapper(lambda);
-
-  Functor functor;
-  testWrapper(functor);
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_FPException.cpp b/paddle/legacy/math/tests/test_FPException.cpp
deleted file mode 100644
index aa6aea71c8d..00000000000
--- a/paddle/legacy/math/tests/test_FPException.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/**
- * This test is about floating point calculation exception.
- * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions.
- *
- * Some exceptions occur in the middle of a set of formulas,
- * that can be circumvented by some tricks.
- * For example,
- * calculate tanh
- *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
- *
- * If the result of (-2 * a) is too large,
- * a FE_OVERFLOW exception occurs when calculating exp.
- * But the result of tanh is no overflow problem,
- * so we can add some tricks to prevent exp calculate an excessive value.
- *
- */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Common.h"
-
-using namespace paddle;  // NOLINT
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-template <typename Matrix>
-void testTanh(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->tanh(*B);
-}
-
-template <typename Matrix>
-void testSigmoid(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->sigmoid(*B);
-}
-
-TEST(fp, overflow) {
-  for (auto illegal : {-90.0, 90.0}) {
-    LOG(INFO) << " illegal=" << illegal;
-    testTanh<CpuMatrix>(illegal);
-    testSigmoid<CpuMatrix>(illegal);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/math/tests/test_GpuProfiler.cpp b/paddle/legacy/math/tests/test_GpuProfiler.cpp
deleted file mode 100644
index ee27109f218..00000000000
--- a/paddle/legacy/math/tests/test_GpuProfiler.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    target->bilinearForward(*input,
-                            imgSizeH,
-                            imgSizeW,
-                            2 * imgSizeH,
-                            2 * imgSizeW,
-                            channels,
-                            ratioH,
-                            ratioW);
-    targetGpu->bilinearForward(*inputGpu,
-                               imgSizeH,
-                               imgSizeW,
-                               2 * imgSizeH,
-                               2 * imgSizeW,
-                               channels,
-                               ratioH,
-                               ratioW);
-  }
-
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->bilinearBackward(*targetGrad,
-                              2 * imgSizeH,
-                              2 * imgSizeW,
-                              imgSizeH,
-                              imgSizeW,
-                              channels,
-                              ratioH,
-                              ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad,
-                                 2 * imgSizeH,
-                                 2 * imgSizeW,
-                                 imgSizeH,
-                                 imgSizeW,
-                                 channels,
-                                 ratioH,
-                                 ratioW);
-
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
-}
-
-TEST(Profiler, testBilinearFwdBwd) {
-  auto numSamples = 10;
-  auto channels = 16;
-  auto imgSize = 64;
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    // Paddle built-in timer
-    REGISTER_TIMER_INFO(
-        "testBilinearFwdBwd",
-        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
-  }
-  globalStat.printAllStatus();
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  // nvprof: GPU Proflier
-  REGISTER_GPU_PROFILER(
-      "RecursiveProfilingTest",
-      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_Matrix.cpp b/paddle/legacy/math/tests/test_Matrix.cpp
deleted file mode 100644
index a9407a31f33..00000000000
--- a/paddle/legacy/math/tests/test_Matrix.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file use autotest::AutoCompare and cmpWithArg to compares the
- * implementation of CPU and GPU member function in Matrix.cpp.
- */
-
-#include <gtest/gtest.h>
-#include "TestUtils.h"
-
-using paddle::BaseMatrix;
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::CpuIVector;
-using paddle::CpuSparseMatrix;
-using autotest::AutoCompare;
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-
-  AutoCompare forward(numSamples, outWidth);
-  CpuMatrix arg1(numSamples, inWidth);
-  arg1.randomizeUniform();
-  forward.cmpWithArg(&Matrix::bilinearForward,
-                     arg1,
-                     imgSizeH,
-                     imgSizeW,
-                     2 * imgSizeH,
-                     2 * imgSizeW,
-                     channels,
-                     ratioH,
-                     ratioW);
-
-  AutoCompare backward(numSamples, inWidth);
-  CpuMatrix arg2(numSamples, outWidth);
-  arg2.randomizeUniform();
-  backward.cmpWithArg(&Matrix::bilinearBackward,
-                      arg2,
-                      2 * imgSizeH,
-                      2 * imgSizeW,
-                      imgSizeH,
-                      imgSizeW,
-                      channels,
-                      ratioH,
-                      ratioW);
-}
-
-TEST(Matrix, BilinearFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
-        }
-      }
-    }
-  }
-}
-
-void testMatrixAddBias(int height, int width, real scale) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(1, width);
-  arg1.randomizeUniform();
-  test.cmpWithArg(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::addBias),
-      arg1,
-      scale);
-}
-
-void testMatrixAddDotMulMMV(int height, int width) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(1, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  test.cmpWithArg(&BaseMatrix::addDotMulMMV, arg1, arg2);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testMatrixAddBias(height, width, 1.0);
-      testMatrixAddBias(height, width, 3.5);
-      testMatrixAddDotMulMMV(height, width);
-    }
-  }
-}
-
-void testMatrixAddAtOffset(int height, int width1, int width2, int offset) {
-  AutoCompare test(height, width2);
-  CpuMatrix arg1(height, width1);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::addAtOffset, arg1, offset);
-}
-
-void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) {
-  AutoCompare test(height, width2);
-  CpuMatrix arg1(height, width1);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::assignAtOffset, arg1, offset);
-}
-
-TEST(Matrix, AtOffset) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width1 : {1, 32, 100, 512, 1000}) {
-      for (auto width2 : {1, 32, 100, 512, 1000}) {
-        int columnOffset = 0;
-        int offset = std::abs(width1 - width2);
-        if (offset) {
-          columnOffset = std::rand() % offset;
-        }
-        VLOG(3) << " height=" << height << " width1=" << width1
-                << " width2=" << width2 << " columnOffset = " << columnOffset;
-        testMatrixAddAtOffset(height, width1, width2, columnOffset);
-        testMatrixAssignAtOffset(height, width1, width2, columnOffset);
-      }
-    }
-  }
-}
-
-void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
-  AutoCompare test(numSamples, inputDim);
-  CpuMatrix arg1(tableSize, inputDim);
-  CpuIVector arg2(numSamples);
-  arg1.randomizeUniform();
-  arg2.rand(tableSize);
-  test.cmpWithArg(&Matrix::selectRows, arg1, arg2);
-}
-
-TEST(Matrix, tableProjection) {
-  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
-    for (auto tableSize : {10, 100}) {
-      for (auto inputDim : {20, 50}) {
-        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
-                << " inputDim=" << inputDim;
-        testMatrixSelectRows(numSamples, tableSize, inputDim);
-      }
-    }
-  }
-}
-
-void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
-  AutoCompare test(outHeight, width);
-  CpuMatrix arg1(inHeight, width);
-  CpuIVector arg2(outHeight);
-  arg1.randomizeUniform();
-  arg2.rand(inHeight);
-  test.cmpWithArg(&Matrix::copyByRowIndex, arg1, arg2);
-}
-
-TEST(Matrix, copyByRowIndex) {
-  for (auto outHeight : {31, 500, 1000}) {
-    for (auto inHeight : {17, 257, 500, 1200}) {
-      for (auto width : {512, 1024}) {
-        VLOG(3) << outHeight << " " << inHeight << " " << width;
-        testMatrixCopyByRowIndex(outHeight, inHeight, width);
-      }
-    }
-  }
-}
-
-void testParamReluForward(int height, int width, int w_height, int w_width) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(w_height, w_width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg1.add(-0.5);
-  test.cmpWithArg(&Matrix::paramReluForward, arg1, arg2);
-}
-
-void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
-  AutoCompare test(w_height, w_width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(height, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg2.add(-0.5);
-  test.cmpWithArg(&Matrix::paramReluBackwardW, arg1, arg2);
-}
-
-TEST(Matrix, paramRelu) {
-  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 40, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          if (width % (w_height * w_width)) continue;
-          testParamReluForward(height, width, w_height, w_width);
-          testParamReluBackwardW(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testAddSharedBias(int numSamples, int dim, int channel) {
-  AutoCompare test(numSamples, dim);
-  CpuMatrix arg1(1, channel);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::addSharedBias, arg1, 1.0);
-}
-
-void testCollectSharedBias(int numSamples, int dim, int channel) {
-  AutoCompare test(1, channel);
-  CpuMatrix arg1(numSamples, dim);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::collectSharedBias, arg1, 1.0);
-}
-
-TEST(Matrix, sharedBias) {
-  for (auto numSamples : {1, 100, 520}) {
-    for (auto dim : {100 * 16, 100 * 32}) {
-      for (auto channel : {8, 16}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " channel=" << channel;
-        testAddSharedBias(numSamples, dim, channel);
-        testCollectSharedBias(numSamples, dim, channel);
-      }
-    }
-  }
-}
-
-void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
-  AutoCompare forward(numSamples, 1);
-  CpuMatrix arg1(numSamples, dim);
-  CpuSparseMatrix arg2(
-      numSamples, dim, numSamples, paddle::NO_VALUE, paddle::SPARSE_CSR);
-
-  CpuMatrix output1(numSamples, dim);
-  output1.randomizeUniform();
-  output1.softmax(arg1);
-  for (int i = 0; i < numSamples; i++) {
-    const unsigned int id = std::rand() % dim;
-    arg2.setRow(i, 1, &id, nullptr);
-  }
-  forward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
-
-  AutoCompare backward(numSamples, dim);
-  backward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
-}
-
-TEST(Matrix, multiBinaryCrossEntropy) {
-  for (auto numSamples : {100, 1000, 10000}) {
-    for (auto dim : {100, 1000, 10000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testMultiBinaryLabelCrossEntropy(numSamples, dim);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_RowBuffer.cpp b/paddle/legacy/math/tests/test_RowBuffer.cpp
deleted file mode 100644
index 2ef8cd303d6..00000000000
--- a/paddle/legacy/math/tests/test_RowBuffer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/RowBuffer.h"
-
-TEST(RowBuffer, testAutoGrow) {
-  paddle::RowBuffer buf(128);
-  ASSERT_EQ(128UL, buf.getWidth());
-  ASSERT_TRUE(buf.isAutoGrowth());
-  buf.resize(2);
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-
-  auto data = buf.getWithAutoGrowth(2);
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    data[i] = i;
-  }
-
-  ASSERT_EQ(3UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
-  }
-}
-
-TEST(RowBuffer, testWithMemBuf) {
-  paddle::CpuMemHandlePtr mem =
-      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
-  paddle::RowBuffer buf(mem, 128);
-  ASSERT_TRUE(!buf.isAutoGrowth());
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
-    }
-  }
-
-  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
-}
diff --git a/paddle/legacy/math/tests/test_SIMDFunctions.cpp b/paddle/legacy/math/tests/test_SIMDFunctions.cpp
deleted file mode 100644
index c6490f70e33..00000000000
--- a/paddle/legacy/math/tests/test_SIMDFunctions.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <random>
-
-#include <stdlib.h>
-#include <time.h>
-
-static constexpr size_t VECTOR_LEN = 3072;
-static constexpr size_t BATCH_SIZE = 64;
-static constexpr size_t ALIGN = 32;
-static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0");
-static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0");
-static constexpr float EPSILON = 1e-5;
-static std::mt19937 RandomEngine(time(0));
-
-inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
-                                                 size_t align = ALIGN) {
-  float* ptr;
-  CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
-  return std::unique_ptr<float[]>(ptr);
-}
-
-inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
-                                                       size_t align = ALIGN) {
-  std::uniform_real_distribution<float> dist(-100.0f, 100.0f);
-  auto generator = std::bind(dist, RandomEngine);
-  auto retv = NewVector(len, align);
-  std::generate_n(retv.get(), len, generator);
-  return retv;
-}
-
-TEST(SIMDFunction, addTo) {
-  typedef std::function<void(float*, const float*, size_t)> AddToMethodType;
-
-  AddToMethodType naive = paddle::simd::naive::addTo<float>;
-  AddToMethodType simd = paddle::simd::addTo<float>;
-
-  auto A = NewRandomVector();
-  auto B = NewRandomVector();
-
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float));
-
-  naive(A.get(), B.get(), VECTOR_LEN);
-  simd(ACopy.get(), B.get(), VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, batchAddTo) {
-  auto A = NewRandomVector();
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN);
-
-  std::vector<std::unique_ptr<float[]>> B;
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    B.emplace_back(NewRandomVector());
-  }
-  std::unique_ptr<float* []> BRaw(new float*[BATCH_SIZE]);
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    BRaw[i] = B[i].get();
-  }
-
-  typedef std::function<void(float*, const float**, int, size_t)>
-      BatchAddToMethodType;
-
-  BatchAddToMethodType naive = paddle::simd::naive::batchAddTo<float>;
-  BatchAddToMethodType simd = paddle::simd::batchAddTo<float>;
-
-  naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, colMax) {
-  auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE);
-  auto naiveResult = NewVector(BATCH_SIZE);
-  auto simdResult = NewVector(BATCH_SIZE);
-
-  typedef std::function<void(float*, const float*, int, int)> ColMaxMethodType;
-  ColMaxMethodType naive = paddle::simd::naive::colMax<float>;
-  ColMaxMethodType simd = paddle::simd::colMax<float>;
-
-  naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lr = NewRandomVector();
-  auto lambda = 0.23f;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float*, float, size_t)>
-      DecayL1MethodType;
-
-  DecayL1MethodType naive = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, lr, l, len);
-  };
-
-  DecayL1MethodType simd = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, lr, l, len);
-  };
-
-  naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithoutLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lambda = 0.23;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float, size_t)> DecayL1MethodType;
-
-  DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, l, len);
-  };
-
-  DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, l, len);
-  };
-
-  naive(dest.get(), src.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
diff --git a/paddle/legacy/math/tests/test_SparseMatrix.cpp b/paddle/legacy/math/tests/test_SparseMatrix.cpp
deleted file mode 100644
index 30896a945ec..00000000000
--- a/paddle/legacy/math/tests/test_SparseMatrix.cpp
+++ /dev/null
@@ -1,565 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <vector>
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(Matrix, CopyCpuMatrixToSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)),
-      ret2(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-struct MatrixPara {
-  size_t height;
-  size_t width;
-  bool trans;
-  bool sparse;
-  size_t nnz;
-  SparseFormat format;
-};
-
-#ifdef PADDLE_WITH_CUDA
-void test_sparse_matrix_mul(MatrixPara paraA,
-                            MatrixPara paraB,
-                            MatrixPara paraC) {
-  // for cpu sparse matrix mul
-  MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h;
-  // for gpu sparse matrix mul
-  MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC;
-  // for cpu dense matrix mul
-  MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC;
-
-  if (paraA.sparse) {
-    cpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            false);
-    gpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            true);
-  } else {
-    cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-    gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true);
-  }
-  cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-
-  if (paraB.sparse) {
-    cpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            false);
-    gpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            true);
-  } else {
-    cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-    gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true);
-  }
-  cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-
-  if (paraC.sparse) {
-    cpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            false);
-    gpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            true);
-    gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height,
-                                                paraC.width,
-                                                paraC.nnz,
-                                                FLOAT_VALUE,
-                                                paraC.format,
-                                                paraC.trans,
-                                                false);
-  } else {
-    cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-    gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true);
-    gpuMatrixC_d2h =
-        Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-  }
-  cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-
-  /*matrix init*/
-  hl_stream_t stream(HPPL_STREAM_1);
-  cpuMatrixA->randomizeUniform();
-  cpuMatrixB->randomizeUniform();
-  cpuMatrixC->randomizeUniform();
-
-  gpuMatrixA->copyFrom(*cpuMatrixA, stream);
-  gpuMatrixB->copyFrom(*cpuMatrixB, stream);
-  gpuMatrixC->copyFrom(*cpuMatrixC, stream);
-
-  cpuDenseA->copyFrom(*cpuMatrixA);
-  cpuDenseB->copyFrom(*cpuMatrixB);
-  cpuDenseC->copyFrom(*cpuMatrixC);
-
-  hl_stream_synchronize(stream);
-
-  /*matrix mul*/
-  cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0);
-  gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0);
-  cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0);
-
-  gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
-  hl_stream_synchronize(stream);
-
-  /*check result*/
-  if (paraC.sparse) {
-    checkSMatrixEqual(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuSparseMatrix>(gpuMatrixC_d2h));
-    checkSMatrixEqual2Dense(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuMatrix>(cpuDenseC));
-  } else {
-    checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h);
-    checkMatrixEqual(cpuMatrixC, cpuDenseC);
-  }
-}
-
-TEST(Matrix, SparseMatrixMul) {
-  const size_t DIM_M = 4;
-  const size_t DIM_N = 4;
-  const size_t DIM_K = 8;
-  const size_t NNZ = 5;
-  for (auto format : {SPARSE_CSC, SPARSE_CSR}) {
-    std::string str_format = format == SPARSE_CSC ? "CSC" : "CSR";
-    LOG(INFO) << "test dense mul " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul " << str_format << "  trans";
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-
-    LOG(INFO) << "test denseT mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-  }
-}
-
-TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-
-  MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true);
-  hl_stream_t gpuStream(HPPL_STREAM_3);
-  testGpuMatrix->copyFrom(*testMatrix, gpuStream);
-  hl_stream_synchronize(gpuStream);
-
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST));
-  mulGpuMatrix->copyFrom(*mulCpuMatrix);
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-#endif
-
-TEST(Matrix, SparseMatrixTranspose) {
-  for (auto height : {10, 50, 100}) {
-    for (auto width : {10, 50, 100}) {
-      auto nnz = height * width;
-      for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-        for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-          for (auto sparseRate : {0.1, 0.2, 0.5}) {
-            MatrixPtr matA = Matrix::createSparseMatrix(
-                height, width, size_t(nnz * sparseRate), valueType, format);
-            MatrixPtr matB(new CpuSparseMatrix(
-                width, height, size_t(nnz * sparseRate), valueType, format));
-            matA->randomizeUniform();
-            matA->transpose(matB, false);
-
-            /*dense matrix transpose*/
-            CpuMatrixPtr matC(new CpuMatrix(height, width));
-            matC->copyFrom(*matA);
-            MatrixPtr matD(new CpuMatrix(width, height));
-            matC->transpose(matD, false);
-
-            /*check result*/
-            checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
-                std::dynamic_pointer_cast<CpuMatrix>(matD));
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixSubMatrix) {
-  const size_t HEIGHT = 10;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-    size_t startRow = 3;
-    size_t rowNum = 2;
-    real sparseRate = 0.1;
-    /*sparse matrix init and get subMatrix*/
-    CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-        HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR);
-    matA->randomizeUniform();
-    CpuSparseMatrixPtr matB = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        matA->subMatrix(startRow, rowNum));
-
-    int start = matA->getRows()[startRow];
-    int end = matA->getRows()[startRow + rowNum];
-
-    /*compare two matrix*/
-    ASSERT_EQ(matB->getElementCnt(), size_t(end - start));
-    if (valueType == FLOAT_VALUE) {
-      for (size_t i = 0; i < matB->getElementCnt(); i++) {
-        ASSERT_FLOAT_EQ(matB->getValue()[start + i],
-                        matA->getValue()[start + i]);
-      }
-    }
-
-    for (size_t i = 0; i < matB->getElementCnt(); i++) {
-      ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]);
-    }
-    for (size_t i = 0; i < rowNum; i++) {
-      ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]);
-    }
-  }
-}
-
-void sparseValid(
-    int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) {
-  CHECK_EQ(nnz, size_t(major[majorLen - 1]));
-  CHECK_EQ(nnz, minorLen);
-  for (size_t i = 0; i < majorLen - 1; i++) {
-    EXPECT_LE(major[i], major[i + 1]);
-    for (int j = major[i]; j < major[i + 1] - 1; j++) {
-      EXPECT_LE(minor[j], minor[j + 1]);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixRandUniform) {
-  const size_t HEIGHT = 5;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  int* major = nullptr;
-  int* minor = nullptr;
-  size_t majorLen = 0;
-  size_t minorLen = 0;
-  size_t nnz = 0;
-  for (auto valueType : {NO_VALUE, FLOAT_VALUE}) {
-    for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-      CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-          HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format);
-      matA->randomizeUniform();
-      nnz = matA->getElementCnt();
-      if (format == SPARSE_CSR) {
-        majorLen = matA->getHeight() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getRows();
-        minor = matA->getCols();
-      } else {
-        majorLen = matA->getWidth() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getCols();
-        minor = matA->getRows();
-      }
-      sparseValid(major, minor, nnz, majorLen, minorLen);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixCopyFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32};
-  sparse_non_value_t data[32];
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = ::rand() % width;
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, NO_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-}
-
-TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  sparse_float_value_t data[32];
-  int value[32] = {
-      1,                       // row_0 : 1
-      5, 3, 1, 6,              // row_1 : 4
-      0, 1, 2, 3,              // row_3 : 4
-      4, 5, 6, 7,              // row_4 : 4
-      2, 3,                    // row_5 : 2
-      3, 5,                    // row_6 : 2
-      0, 1,                    // row_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // row_8 : 8
-      2, 4, 7, 3, 1            // row_9 : 5
-  };
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = value[i];
-    data[i].value = float(value[i]);
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-
-  size_t trimedWidth = 4;
-  int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19};
-  sparse_float_value_t trimedData[19];
-  int trimedValue[19] = {
-      1,  // row_0 : 1
-      3,
-      1,  // row_1 : 2
-      0,
-      1,
-      2,
-      3,  // row_3 : 4
-      2,
-      3,  // row_5 : 2
-      3,  // row_6 : 1
-      0,
-      1,  // row_7 : 2
-      0,
-      1,
-      2,
-      3,  // row_8 : 4
-      2,
-      3,
-      1  // row_9 : 3
-  };
-  for (size_t i = 0; i < 19; i++) {
-    trimedData[i].col = trimedValue[i];
-    trimedData[i].value = float(trimedValue[i]);
-  }
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false);
-  matA->copyFrom(trimedIndices, trimedData);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getRows()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[height] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSR,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
-
-TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
-  size_t height = 8;
-  size_t width = 10;
-  int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  int value[32] = {
-      1,                       // col_0 : 1
-      5, 3, 1, 6,              // col_1 : 4
-      0, 1, 2, 3,              // col_3 : 4
-      4, 5, 6, 7,              // col_4 : 4
-      2, 3,                    // col_5 : 2
-      3, 5,                    // col_6 : 2
-      0, 1,                    // col_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // col_8 : 8
-      2, 4, 7, 3, 1            // col_9 : 5
-  };
-  std::vector<int> rows(value, value + 32);
-  std::vector<int> cols(indices, indices + 11);
-  std::vector<real> values(value, value + 32);
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSC, false);
-  mat->copyFrom(rows, cols, values);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getCols()[0]));
-  for (size_t i = 1; i < width + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getCols()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i]));
-  }
-
-  size_t trimedWidth = 5;
-  int trimedIndices[6] = {0, 1, 5, 5, 9, 13};
-  int trimedValue[13] = {
-      1,  // col_0 : 1
-      5,
-      3,
-      1,
-      6,  // col_1 : 4
-      0,
-      1,
-      2,
-      3,  // col_3 : 4
-      4,
-      5,
-      6,
-      7  // col_4 : 4
-  };
-  std::vector<int> rowsA(trimedValue, trimedValue + 13);
-  std::vector<int> colsA(trimedIndices, trimedIndices + 6);
-  std::vector<real> valuesA(trimedValue, trimedValue + 13);
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false);
-  matA->copyFrom(rowsA, colsA, valuesA);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getCols()[0]));
-  for (size_t i = 1; i < trimedWidth + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getCols()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[trimedWidth] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i]));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSC,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
diff --git a/paddle/legacy/math/tests/test_Tensor.cu b/paddle/legacy/math/tests/test_Tensor.cu
deleted file mode 100644
index 3ce056d6614..00000000000
--- a/paddle/legacy/math/tests/test_Tensor.cu
+++ /dev/null
@@ -1,1162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuVector;
-using paddle::GpuVector;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-#define INIT_UNARY(A1, A2)  \
-  Tensor A1(height, width); \
-  Tensor A2(height, width); \
-  A1.randomizeUniform();    \
-  A2.copyFrom(A1)
-#define INIT_BINARY(A1, A2, B) \
-  INIT_UNARY(A1, A2);          \
-  Tensor B(height, width);     \
-  B.randomizeUniform()
-#define INIT_TERNARY(A1, A2, B, C) \
-  INIT_BINARY(A1, A2, B);          \
-  Tensor C(height, width);         \
-  C.randomizeUniform()
-#define INIT_QUATERNARY(A1, A2, B, C, D) \
-  INIT_TERNARY(A1, A2, B, C);            \
-  Tensor D(height, width);               \
-  D.randomizeUniform()
-
-template <typename Tensor>
-struct TestUnaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_UNARY(A1, A2);
-        testUnaryFunc(A1, A2);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestBinaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
-
-  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_BINARY(A1, A2, B);
-        testBinaryFunc(A1, A2, B);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestTernaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
-      TernaryFunc;
-
-  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_TERNARY(A1, A2, B, C);
-        testTernaryFunc(A1, A2, B, C);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestQuaternaryMatrix {
-  typedef std::function<void(
-      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
-      QuaternaryFunc;
-
-  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_QUATERNARY(A1, A2, B, C, D);
-        testQuaternaryFunc(A1, A2, B, C, D);
-      }
-    }
-  }
-};
-
-template <typename Tensor, class T>
-struct TestUnaryVectorT {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
-    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
-      LOG(INFO) << " size=" << size;
-      Tensor A1(size);
-      Tensor A2(size);
-      if (typeid(T) == typeid(real)) {
-        A1.rand();
-      } else {
-        A1.rand(1000);
-      }
-      A2.copyFrom(A1);
-      testUnaryFunc(A1, A2);
-    }
-  }
-};
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-    }
-  }
-}
-
-template <typename Tensor>
-void testTensorAddScalar(Tensor& A1, Tensor& A2) {
-  real p1 = 2.5;
-  real p2 = 3.0;
-  A1.add(p1);  // a += p
-  A2 += p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(p1, p2);  // a = a * p1 + p2
-  A2 = A2 * p1 + p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSubScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.subScalar(p);  // a -= p
-  A2 -= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMulScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.mulScalar(p);  // a *= p
-  A2 *= p;
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(learningRate, decayRate);
-  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDivScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.divScalar(p);  // a /= p
-  A2 /= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorNeg(Tensor& A1, Tensor& A2) {
-  A1.neg();  // a = -a
-  A2 = -A2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2) {
-  A1.abs2();  // a = a > 0 ? a : -a
-  A2 = A2.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2) {
-  A1.square2();  // a = a * a
-  A2 = A2.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2) {
-  A1.reciprocal2();  // a = 1.0f / a
-  A2 = A2.reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2) {
-  A1.sign2();  // a = (a > 0) - (a < 0)
-  A2 = A2.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2) {
-  A1.assign(1.5);  // a = p
-  A2 = A2.constant(1.5);
-  TensorCheckEqual(A1, A2);
-
-  A1.one();  // a = 1
-  A2 = A2.constant(1.0);
-  TensorCheckEqual(A1, A2);
-
-  A1.zero();  // a = 0
-  A2 = A2.constant(0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
-  testTensorAddScalar(A1, A2);
-  testTensorSubScalar(A1, A2);
-  testTensorMulScalar(A1, A2);
-  testTensorDivScalar(A1, A2);
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-  testTensorSquare(A1, A2);
-  testTensorReciprocal(A1, A2);
-  testTensorSign(A1, A2);
-  testTensorAssign(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
-  A1.add(2);  // a += p
-  A2 += 2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(3, 2);  // a = a * p1 + p2
-  A2 = A2 * 3 + 2;
-  TensorCheckEqual(A1, A2);
-
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-}
-
-TEST(Unary, BaseOp) {
-  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
-  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
-  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
-      testUnaryBaseOpInt<CpuIVector>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
-  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
-  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
-      testUnaryBaseOpInt<GpuIVector>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2) {
-  A1.exp2();  // a = exp(a)
-  A2 = A2.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2) {
-  A1.log2();  // a = log(a)
-  A2 = A2.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2) {
-  A1.sqrt2();  // a = sqrt(a)
-  A2 = A2.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2) {
-  A1.pow2(3.2);  // a = pow(a, p)
-  A2 = A2.pow(3.2);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrMathOp(Tensor& A1, Tensor& A2) {
-  testTensorExp(A1, A2);
-  testTensorLog(A1, A2);
-  testTensorSqrt(A1, A2);
-  testTensorPow(A1, A2);
-}
-
-TEST(Unary, MathOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorClip(Tensor& A1, Tensor& A2) {
-  real p1 = 0.003f;
-  real p2 = 0.877f;
-  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
-  // A2 = A2.min(0.877f).max(0.003f);
-  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
-  real p = 0.5f;
-  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
-  A2 = (A2 > p).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2) {
-  /**
-   * T lambda = p;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(learningRate, decayRate);
-  A2 = (A2 > (learningRate * decayRate))
-           .condition(
-               (A2 - (learningRate * decayRate)),
-               (A2 < -(learningRate * decayRate))
-                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
-  testTensorClip(A1, A2);
-  testTensorBiggerThanScalar(A1, A2);
-
-  A1.randomizeUniform();
-  A1.subScalar(0.5f);
-  A2.copyFrom(A1);
-  testTensorapplyL1(A1, A2);
-}
-
-TEST(Unary, CompareOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.2;
-  A1.add(B);  // a += b
-  A2 += B;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1);  // a += b * p
-  A2 += B * p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
-  A2 = A2 * p1 + B * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.addScalar(B, p1);  // a = b + p
-  A2 = B + p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.addSquare(B, p1);  // a += p * b * b
-  A2 += B.constant(p1) * B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
-  A2 = A2 * p1 + B.constant(p2) * B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.sub(B);  // a -= b
-  A2 -= B;
-  TensorCheckEqual(A1, A2);
-
-  A1.sub(B, p);  // a -= b * p
-  A2 -= B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.subScalar(B, p);  // a = b - p
-  A2 = B - p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.mulScalar(B, p);  // a = b * p
-  A2 = B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B);  // a *= b * b
-  A2 *= B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareMul(B);  // a = a * a * b
-  A2 = A2 * A2 * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMul(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.divScalar(B, p);  // a = b / p
-  A2 = B / p;
-  TensorCheckEqual(A1, A2);
-
-  A1.scalarDiv(B, p);  // a = p / b
-  A2 = B.constant(p) / B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.assign(B);  // a = b
-  A2 = B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.square2(A1);  // b = a * a
-  A2 = B.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.squareDerivative(B);  // a *= 2.0 * b
-  A2 = A2 * (real)2.0 * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.reciprocal2(A1);  // b = 1.0f / a
-  A2 = B.reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 0.58;
-  real p2 = 0.32;
-  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
-  A2 = (B * p1 + p2).reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
-  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
-            .reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reciprocalDerivative(B);  // a *= -b * b
-  A2 *= (-B) * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
-  A2 = B.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.abs2(A1);  // b = a > 0.0f ? a : -a
-  A2 = B.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorAdd(A1, A2, B);
-  testTensorSub(A1, A2, B);
-  testTensorMul(A1, A2, B);
-  testTensorDiv(A1, A2, B);
-  testTensorSquare(A1, A2, B);
-  testTensorSquareDerivative(A1, A2, B);
-  testTensorReciprocal(A1, A2, B);
-  testTensorReciprocalDerivative(A1, A2, B);
-  testTensorAbs(A1, A2, B);
-  testTensorSign(A1, A2, B);
-  testTensorAssign(A1, A2, B);
-}
-
-TEST(Binary, BaseOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = exp(b)
-  A1.exp2(B);
-  A2 = B.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.expDerivative(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = log(b)
-  A1.log2(B);
-  A2 = B.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = sqrt(b)
-  A1.sqrt2(B);
-  A2 = B.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = 1.0f / sqrt(b)
-  A1.invSqrt(B);
-  A2 = B.sqrt().reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.pow2(B, 2.5f);  // a = pow(b, p)
-  A2 = B.pow(2.5f);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * b = log(1.0 +
-   *         exp((a > THRESHOLD) ? THRESHOLD
-   *             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
-   */
-  B.softrelu(A1);
-
-  real THRESHOLD = 40.0;
-  A2 = (B.constant(1.0f) +
-        (B > THRESHOLD)
-            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
-            .exp())
-           .log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-   *                             ? THRESHOLD
-   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-   */
-  A1.softreluDerivative(B);
-  real THRESHOLD = 40.0;
-  A2 = A2 *
-       (B.constant(1.0f) -
-        (B.constant(-1.0f) *
-         (B > THRESHOLD)
-             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
-            .exp());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-    const T THRESHOLD_MIN = -40.0;
-    const T THRESHOLD_MAX = 13.0;
-    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
-            : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-    b = 1.0f / (1.0f + exp(-tmp)))
-   */
-  B.sigmoid(A1);
-
-  const real THRESHOLD_MIN = -40.0;
-  const real THRESHOLD_MAX = 13.0;
-  auto tmp = (B < THRESHOLD_MIN)
-                 .condition(THRESHOLD_MIN,
-                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
-  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
-  A2 *= B * (B.constant(1.0f) - B);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
-  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.tanhDerivative(B);  // a *= 1 - b * b
-  A2 *= B.constant(1.0f) - B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
-  B.scaledTanh(A1, p1, p2);
-  A2 = B.constant(p1) *
-       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
-        (real)1.0);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // a *= (p2 / p1) * (p1 * p1 - b * b));
-  A1.scaledTanhDerivative(B, p1, p2);
-  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorTanhDerivative(A1, A2, B);
-  testTensorScaledTanhDerivative(A1, A2, B);
-  testTensorSigmoidDerivative(A1, A2, B);
-  testTensorExpDerivative(A1, A2, B);
-  testTensorScaledTanh(A1, A2, B);
-  testTensorTanh(A1, A2, B);
-  testTensorExp(A1, A2, B);
-  testTensorLog(A1, A2, B);
-  testTensorSqrt(A1, A2, B);
-  testTensorInvSqrt(A1, A2, B);
-  testTensorPow(A1, A2, B);
-
-  testTensorSoftrelu(A1, A2, B);
-  testTensorSoftreluDerivative(A1, A2, B);
-  testTensorSigmoid(A1, A2, B);
-}
-
-TEST(Binary, MathOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
-  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
-  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * b = a > p1 ? a : p1
-   * b = b < p2 ? b : p2
-   * int p1 = 0, p2 = 24;
-   */
-  SetTensorValue(B, 32.0f);
-  B.brelu(A1);
-  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
-  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  SetTensorValue(B, 32.0f);
-  /*
-   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
-   * int p1 = 0, p2 = 24;
-   */
-  A1.breluDerivative(B);
-  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
-  A2 = (B > (real)0.0f)
-           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 0.613;
-  SetTensorValue(B, p);
-  A1.isEqualTo(B, p);  // a = (b == p)
-  A2 = (B == p);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
-  /**
-   * T lambda = p * b;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(B, learningRate, decayRate);
-  auto lambda = B.constant(learningRate * decayRate) * B;
-  A2 = (A2 > lambda)
-           .condition((A2 - lambda),
-                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.subScalar(0.5f);
-  SetTensorValue(B, 0.0f);
-  testTensorReluDerivative(A1, A2, B);
-
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  testTensorBreluDerivative(A1, A2, B);
-
-  testTensorAbsDerivative(A1, A2, B);
-  testTensorRelu(A1, A2, B);
-  testTensorBrelu(A1, A2, B);
-  testTensorIsEqualTo(A1, A2, B);
-}
-
-TEST(Binary, CompareOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.add(B, C);  // a = b + c
-  A2 = B + C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.8;
-  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
-  A2 = B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C);  // a = a + b + c
-  A2 = A2 + B + C;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
-  A2 = A2 * p1 + B * p2 + C * p3;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
-  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.sub(B, C);  // a = b - c
-  A2 = B - C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
-  A2 = B * p1 - C * p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotMul(B, C);  // a = b * c
-  A2 = B * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B, C);  // a = b * c * c
-  A2 = B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareSquare(B, C);  // a = b * b * c * c
-  A2 = B * B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a *= tmp * tmp
-   */
-  A1.dotMulSquareSum(B, C, p1, p2);
-  auto tmp = B * p1 + C * p2;
-  A2 *= tmp * tmp;
-  TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a = tmp * tmp
-   */
-  A1.dotSquareSum(B, C, p1, p2);
-  auto tmp2 = B * p1 + C * p2;
-  A2 = tmp2 * tmp2;
-  TensorCheckEqual(A1, A2);
-
-  // a *= p1 * b + p2 * c
-  A1.dotMulSum(B, C, p1, p2);
-  A2 *= B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  // a = p1 * a + p2 * b * c
-  A1.addDotMul(B, C, p1, p2);
-  A2 = A2 * p1 + B.constant(p2) * B * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
-  A2 = (B == (real)0.0).condition((real)0.0, B / C);
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
-  A2 = (B + p1) / (C + p2);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.5;
-  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
-  A2 = (B * p1 + C * p2 + p3).reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
-  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropyBp(Tensor& A1,
-                                  Tensor& A2,
-                                  Tensor& B,
-                                  Tensor& C) {
-  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
-  A2 += (B - C) / (B * (B.constant(1.0f) - B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorAdd(A1, A2, B, C);
-  testTensorSub(A1, A2, B, C);
-  testTensorMul(A1, A2, B, C);
-  testTensorDiv(A1, A2, B, C);
-  testTensorReciprocal(A1, A2, B, C);
-  testTensorSoftCrossEntropyBp(A1, A2, B, C);
-
-  testTensorSoftCrossEntropy(A1, A2, B, C);
-}
-
-TEST(Ternary, BaseOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropy(Tensor& A1,
-                                       Tensor& A2,
-                                       Tensor& B,
-                                       Tensor& C) {
-  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
-  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
-                                         Tensor& A2,
-                                         Tensor& B,
-                                         Tensor& C) {
-  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
-  A1.binaryLabelCrossEntropyBp(B, C);
-  A2 += (C > (real)0.5)
-            .condition((B.constant(-1.0f) / B),
-                       (B.constant(1.0f) - B).reciprocal());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLoss(Tensor& A1,
-                                      Tensor& A2,
-                                      Tensor& B,
-                                      Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * a = log(1 + exp(x)) - c * x
-   */
-  A1.logisticRegressionLoss(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLossBp(Tensor& A1,
-                                        Tensor& A2,
-                                        Tensor& B,
-                                        Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * x = exp(x); a = x / (1 + x) - c
-   */
-  A1.logisticRegressionLossBp(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  auto tmp2 = tmp.exp();
-  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
-  A2 = (B > C).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.max2(B, C);  // a = (b > c) ? b : c
-  A2 = (B > C).condition(B, C);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
-  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
-  testTensorBiggerThan(A1, A2, B, C);
-  testTensorMax(A1, A2, B, C);
-
-  testTensorLogisticRegressionLoss(A1, A2, B, C);
-  testTensorLogisticRegressionLossBp(A1, A2, B, C);
-}
-
-TEST(Ternary, CompareOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testQuaternaryAdd(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
-  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
-  // TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c + p3 * d;
-   * a += tmp * tmp
-   */
-  real p1 = 1.5f;
-  real p2 = 2.5f;
-  real p3 = 3.5f;
-  A1.addSquareSum(B, C, D, p1, p2, p3);
-  auto tmp = B * p1 + C * p2 + D * p3;
-  A2 += tmp * tmp;
-  TensorCheckEqual(A1, A2);
-}
-
-TEST(Quaternary, BaseOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-  A1.biggerThan(B, C, D);
-  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
-           .condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLoss(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = log(1 + exp(a)) - a * d
-   */
-  A1.rankLoss(B, C, D);
-
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLossBp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = exp(a); a = (a / (1 + a) - d)
-   */
-  A1.rankLossBp(B, C, D);
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  auto tmp3 = tmp2.exp();
-  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testQuaternaryCompareOp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  testTensorBiggerThan(A1, A2, B, C, D);
-  testTensorRankLoss(A1, A2, B, C, D);
-  testTensorRankLossBp(A1, A2, B, C, D);
-}
-
-TEST(Quaternary, CompareOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
-#endif
-}
diff --git a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
deleted file mode 100644
index 214ae8971ae..00000000000
--- a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
+++ /dev/null
@@ -1,461 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "OriginalOptimizerApi.h"
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/legacy/math/TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-#ifndef PADDLE_TYPE_DOUBLE
-DEFINE_double(max_diff, 1e-5, "max diff allowed");
-#else
-DEFINE_double(max_diff, 1e-13, "max diff allowed");
-#endif
-
-class SetMaxDiff {
- public:
-  explicit SetMaxDiff(double max_diff) {
-    max_diff_ = FLAGS_max_diff;
-    FLAGS_max_diff = max_diff;
-  }
-  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
-
- private:
-  double max_diff_;
-};
-
-#define COPY_VECTOR_TO_CPU(cpuVec, vector)               \
-  do {                                                   \
-    if (vector->useGpu()) {                              \
-      cpuVec = Vector::create(vector->getSize(), false); \
-      cpuVec->copyFrom(*vector);                         \
-    } else {                                             \
-      cpuVec = vector;                                   \
-    }                                                    \
-  } while (0)
-
-int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (fabs(a - b) > FLAGS_max_diff) {
-      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
-        count++;
-      }
-    }
-  }
-
-  return count;
-}
-
-int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
-  VectorPtr tmp1;
-  VectorPtr tmp2;
-  COPY_VECTOR_TO_CPU(tmp1, vector1);
-  COPY_VECTOR_TO_CPU(tmp2, vector2);
-  return VectorCheckErr(*tmp1, *tmp2);
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define CHECK_VECTORPTR(vector1, vector2) \
-  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
-
-#else
-
-#define CHECK_VECTORPTR(vector1, vector2)
-
-#endif
-
-typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
-
-void testCase(testMatrixFunc matrixFunc) {
-#ifdef PADDLE_WITH_CUDA
-  for (auto useGpu : {false, true}) {
-#else
-  for (auto useGpu : {false}) {
-#endif
-    for (auto size : {1,
-                      32,
-                      64,
-                      128,
-                      512,
-                      1024,
-                      4096,
-                      32768,
-                      65536,
-                      131072,
-                      262144,
-                      524288,
-                      1048576,
-                      2097152}) {
-      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
-      matrixFunc(size, useGpu);
-    }
-  }
-}
-
-#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
-  vec1[type] = Vector::create(size, useGpu);        \
-  vec2[type] = Vector::create(size, useGpu);        \
-  vec1[type]->rand();                               \
-  vec2[type]->copyFrom(*vec1[type]);
-
-void testAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
-      bufs1, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adagradApply(value,
-                                      grad,
-                                      mom,
-                                      accum_buffer,
-                                      accum,
-                                      lr,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, Adagrad) { testCase(testAdagrad); }
-
-void testAdaDelta(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
-      bufs1, rou, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adadeltaApply(value,
-                                       grad,
-                                       mom,
-                                       accum,
-                                       accum_update,
-                                       lr,
-                                       rou,
-                                       epsilon,
-                                       learningRate,
-                                       momentum,
-                                       decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, AdaDelta) { testCase(testAdaDelta); }
-
-template <bool isFirstTime>
-void testRMSProp(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  /* make sure 'g - f.square()' greater than 0 */
-  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
-  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
-      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
-                                                   accumulatedRou,
-                                                   rou,
-                                                   epsilon,
-                                                   learningRate,
-                                                   momentum,
-                                                   decayRate,
-                                                   isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(rmspropApply(value,
-                                      grad,
-                                      mom,
-                                      sum,
-                                      sum1,
-                                      lr,
-                                      accumulatedRou,
-                                      rou,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate,
-                                      isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, RMSProp) {
-  testCase(testRMSProp<true>);
-  testCase(testRMSProp<false>);
-}
-
-template <bool isFirstTime>
-void testDecayedAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  if (isFirstTime) {
-    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-  }
-
-  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
-                                                          accumulatedRou,
-                                                          rou,
-                                                          epsilon,
-                                                          learningRate,
-                                                          momentum,
-                                                          decayRate,
-                                                          isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
-                                             grad,
-                                             mom,
-                                             sum,
-                                             lr,
-                                             accumulatedRou,
-                                             rou,
-                                             epsilon,
-                                             learningRate,
-                                             momentum,
-                                             decayRate,
-                                             isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, DecayedAdagrad) {
-  testCase(testDecayedAdagrad<false>);
-  testCase(testDecayedAdagrad<true>);
-}
-
-void testAdam(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
-      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
-
-  EXPRESSION_PERFORMANCE(adamApply(value,
-                                   grad,
-                                   mom,
-                                   v,
-                                   beta1,
-                                   beta2,
-                                   beta1_power,
-                                   beta2_power,
-                                   epsilon,
-                                   learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
-                  bufs2[PARAMETER_SECOND_MOMENTUM]);
-}
-
-TEST(Training, Adam) { testCase(testAdam); }
-
-void testAdamax(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
-  int64_t step = 2;
-
-  EXPRESSION_PERFORMANCE(
-      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  EXPRESSION_PERFORMANCE(
-      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
-                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
-}
-
-TEST(Training, Adamax) {
-#ifndef PADDLE_TYPE_DOUBLE
-  SetMaxDiff diff(1e-4);
-#endif
-  testCase(testAdamax);
-}
-
-void testSparseMomentum(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
-
-  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
-  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
-      bufs1, alpha, beta, gamma, tau, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
-  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
-
-  EXPRESSION_PERFORMANCE(sparseMomentumApply(
-      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
-}
-
-TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
diff --git a/paddle/legacy/math/tests/test_batchTranspose.cpp b/paddle/legacy/math/tests/test_batchTranspose.cpp
deleted file mode 100644
index ccfd6d5aae2..00000000000
--- a/paddle/legacy/math/tests/test_batchTranspose.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_batch_transpose.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
-  const int nx = 100;
-  const int ny = 50;
-  const int numSamples = 50;
-
-  MatrixPtr cMat = Matrix::create(numSamples, nx * ny, false, false);
-  MatrixPtr gMat = Matrix::create(numSamples, nx * ny, false, true);
-
-  MatrixPtr cBatchTransMat = Matrix::create(numSamples, nx * ny, false, false);
-  MatrixPtr gBatchTransMat = Matrix::create(numSamples, nx * ny, false, true);
-  MatrixPtr cMat_d2h = Matrix::create(numSamples, nx * ny, false, false);
-
-  real* cData = cMat->getData();
-  real* gold = cBatchTransMat->getData();
-
-  // host
-  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
-    for (int j = 0; j < ny; j++)
-      for (int i = 0; i < nx; i++)
-        cData[sample_id * nx * ny + j * nx + i] = j * nx + i;
-
-  // correct result for error checking
-  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
-    for (int j = 0; j < ny; j++)
-      for (int i = 0; i < nx; i++)
-        gold[sample_id * nx * ny + i * ny + j] =
-            cData[sample_id * nx * ny + j * nx + i];
-  // device
-  gMat->copyFrom(*cMat, HPPL_STREAM_DEFAULT);
-  batchTranspose(
-      gMat->getData(), gBatchTransMat->getData(), nx, ny, numSamples);
-  cMat_d2h->copyFrom(*gBatchTransMat, HPPL_STREAM_DEFAULT);
-  checkMatrixEqual(cBatchTransMat, cMat_d2h);
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_lazyAssign.cu b/paddle/legacy/math/tests/test_lazyAssign.cu
deleted file mode 100644
index cf8c3d77199..00000000000
--- a/paddle/legacy/math/tests/test_lazyAssign.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/TensorAssign.h"
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-typedef std::function<void(int height, int width)> testMatrixFunc;
-void testMatrixCase(testMatrixFunc matrixFunc) {
-  for (auto height : {1}) {
-    for (auto width : {1,
-                       32,
-                       64,
-                       128,
-                       512,
-                       1024,
-                       4096,
-                       32768,
-                       65536,
-                       131072,
-                       262144,
-                       524288,
-                       1048576,
-                       2097152,
-                       4194304,
-                       8388608}) {
-      matrixFunc(height, width);
-    }
-  }
-}
-
-template <typename Tensor>
-void testLazyAssign(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor B(height, width);
-  Tensor C(height, width);
-  Tensor D(height, width);
-  A1.randomizeUniform();
-  B.randomizeUniform();
-  C.randomizeUniform();
-  D.randomizeUniform();
-  A2.copyFrom(A1);
-
-  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
-
-  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
-                         auto expr2 = A2.lazyAssign(A2 * D);
-                         AssignEvaluate(expr1, expr2););
-
-  TensorCheckErr(A1, A2);
-}
-
-TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
-#endif
-
-template <typename Tensor>
-void sgdUpdateTensor(
-    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
-  C = C * p2 - D * (B + A * p3) * p1;
-  A += C;
-}
-
-void sgdUpdateLazyAssign(BaseMatrix& A,
-                         BaseMatrix& B,
-                         BaseMatrix& C,
-                         BaseMatrix& D,
-                         real p1,
-                         real p2,
-                         real p3) {
-  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
-  auto expr2 = A.lazyAssign(A + C);
-  AssignEvaluate(expr1, expr2);
-}
-
-template <typename Tensor>
-void testSgdUpdate(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor A3(height, width);
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  A3.copyFrom(A1);
-
-  Tensor B(height, width);
-  B.randomizeUniform();
-
-  Tensor C1(height, width);
-  Tensor C2(height, width);
-  Tensor C3(height, width);
-  C1.randomizeUniform();
-  C2.copyFrom(C1);
-  C3.copyFrom(C1);
-
-  Tensor D(height, width);
-  D.randomizeUniform();
-
-  real p1 = 0.2;
-  real p2 = 0.3;
-  real p3 = 0.5;
-
-  /**
-   * c = p2 * c - p1 * (b + p3 * a);
-   * a = a + c;
-   */
-  // BaseMatrix API
-  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
-
-  // Tensor expression
-  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
-
-  // lazyAssign
-  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
-
-  TensorCheckErr(A1, A2);
-  TensorCheckErr(A1, A3);
-  TensorCheckErr(C1, C2);
-  TensorCheckErr(C1, C3);
-}
-
-TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
-#endif
diff --git a/paddle/legacy/math/tests/test_matrixCompare.cpp b/paddle/legacy/math/tests/test_matrixCompare.cpp
deleted file mode 100644
index a43adde46fc..00000000000
--- a/paddle/legacy/math/tests/test_matrixCompare.cpp
+++ /dev/null
@@ -1,1698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-void testMatrixMaxSequence(int batchSize, int inputDim) {
-  // forward
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  IVectorPtr cpuIndex = nullptr;
-  IVectorPtr gpuIndex = nullptr;
-  IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false);
-  IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true);
-  cpuIndex->zeroMem();
-  gpuIndex->zeroMem();
-
-  cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
-  gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-  TensorCheckEqual(*cpuIndex, *gpuIndex);
-
-  // backward
-  MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutputGrad = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
-  gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
-}
-
-TEST(Matrix, maxSequence) {
-  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
-    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testMatrixMaxSequence(batchSize, inputDim);
-    }
-  }
-}
-
-void testMatrixGetSum(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  int x = log10(height * width);
-  real err = 1e-6 * pow(10, x);
-#else
-  real err = 1e-8;
-#endif
-
-  real cpuSum = cpuInput->getSum();
-  real gpuSum = gpuInput->getSum();
-
-  EXPECT_LE(fabs(cpuSum - gpuSum), err);
-}
-
-void testMatrixGetMinMax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  real cpuMin = cpuInput->getMin();
-  real gpuMin = gpuInput->getMin();
-  real cpuMax = cpuInput->getMax();
-  real gpuMax = gpuInput->getMax();
-
-  EXPECT_EQ(cpuMin, gpuMin);
-  EXPECT_EQ(cpuMax, gpuMax);
-}
-
-void testMatrixZeroAtOffset(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuTest = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuTest->copyFrom(*cpuA);
-
-  int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
-  int numColumns = rand() % (width - columnOffset);  // NOLINT
-
-  if (numColumns == 0) return;
-
-  cpuA->zeroAtOffset(columnOffset, numColumns);
-  gpuA->zeroAtOffset(columnOffset, numColumns);
-
-  /* cpuTest */
-  real* a = cpuTest->getData() + columnOffset;
-  for (int64_t i = 0; i < height; ++i) {
-    for (int64_t j = 0; j < numColumns; ++j) {
-      a[i * width + j] = 0;
-    }
-  }
-
-  TensorCheckEqual(*cpuA, *gpuA);
-  TensorCheckEqual(*cpuA, *cpuTest);
-}
-
-void testMatrixDeepSwap(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuCopyA->copyFrom(*cpuA);
-  cpuCopyB->copyFrom(*cpuB);
-
-  // swap matrix cpuA and cpuB
-  cpuA->deepSwap(*cpuB);
-
-  TensorCheckEqual(*cpuA, *cpuCopyB);
-  TensorCheckEqual(*cpuB, *cpuCopyA);
-}
-
-void testMatrixTranspose(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuT = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuT = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-  cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, true);
-
-  TensorCheckEqual(*cpuT, *gpuT);
-}
-
-void testMatrixRotate(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-
-  cpu->rotate(cpuR, false, true);
-  gpu->rotate(gpuR, true, true);
-  TensorCheckEqual(*cpuR, *gpuR);
-
-  cpu->rotate(cpuR, true, false);
-  gpu->rotate(gpuR, false, false);
-  TensorCheckEqual(*cpuR, *gpuR);
-}
-
-void testMatrixInverse(int height) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
-  MatrixPtr cpuI = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpuI = std::make_shared<GpuMatrix>(height, height);
-
-  /* Make matrix well conditioned: cpu * cpuT + Identity */
-  cpu->randomizeUniform();
-  MatrixPtr cpuT = cpu->getTranspose();
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
-  outputCheck->mul(*cpu, *cpuT);
-  cpu->setDiag(1.0);
-  cpu->add(*outputCheck);
-
-  gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, true);
-  gpu->inverse(gpuI, false);
-
-  TensorCheckErr(*cpuI, *gpuI);
-
-  outputCheck->mul(*cpu, *cpuI);
-  cpu->setDiag(1.0);
-  TensorCheckErr(*cpu, *outputCheck);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixDeepSwap(height, width);
-      testMatrixZeroAtOffset(height, width);
-      testMatrixGetSum(height, width);
-      testMatrixTranspose(height, width);
-      testMatrixRotate(height, width);
-    }
-#ifdef LAPACK_FOUND
-    // inverse matrix
-    testMatrixInverse(height);
-#else
-    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
-                 << "support so we cannot test matrix inverse. To test "
-                 << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas, and re-build PaddlePaddle.";
-#endif
-  }
-}
-
-void testMatrixSoftmax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-void testSequenceSoftmax(int batchSize) {
-  // forward
-  int inputDim = 1;
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
-  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
-
-  TensorCheckErr(*cpuInput, *gpuInput);
-}
-
-void testMatrixSoftmaxThreshold(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  cpuInput->getData()[0] = 100.0;
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  // check output zero
-  int cpuCount = 0;
-  int gpuCount = 0;
-  auto zeroNum = [](MatrixPtr out, int& count) {
-    for (size_t i = 0; i < out->getHeight(); i++) {
-      for (size_t j = 0; j < out->getWidth(); j++) {
-        if (out->getElement(i, j) == 0) count++;
-      }
-    }
-  };
-  zeroNum(cpuOutput, cpuCount);
-  zeroNum(outputCheck, gpuCount);
-  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
-  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
-}
-
-void testMatrixSoftmaxBp(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuOutput->softmaxBackward(*gpuInput);
-
-  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
-  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
-  sftMaxSum->colMerge(*sftMaxDot);
-  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-TEST(Matrix, softmax) {
-  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 127
-    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixSoftmax(height, width);
-      testMatrixSoftmaxBp(height, width);
-      testMatrixSoftmaxThreshold(height, width);
-    }
-    testSequenceSoftmax(height);
-  }
-}
-
-void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
-  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
-  cpuTable->randomizeUniform();
-  gpuTable->copyFrom(*cpuTable);
-
-  IVectorPtr cpuIds;
-  IVectorPtr gpuIds;
-  cpuIds = VectorT<int>::create(numSamples, false);
-  gpuIds = VectorT<int>::create(numSamples, true);
-  cpuIds->rand(tableSize);
-  gpuIds->copyFrom(*cpuIds);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuOutput->addToRows(*cpuTable, *cpuIds);
-  gpuOutput->addToRows(*gpuTable, *gpuIds);
-
-  TensorCheckErr(*cpuTable, *gpuTable);
-}
-
-TEST(Matrix, tableProjection) {
-  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
-    for (auto tableSize : {10, 100}) {
-      for (auto inputDim : {20, 50}) {
-        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
-                << " inputDim=" << inputDim;
-        testMatrixAddToRows(numSamples, tableSize, inputDim);
-      }
-    }
-  }
-}
-
-void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  cpuC->mul(*cpuA, *cpuB, alpha, beta);
-  gpuC->mul(*gpuA, *gpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  auto subSize = [](int& start, int& end, int dim) {
-    if (dim == 1) {
-      start = 0;
-      end = dim;
-    } else {
-      int subDim = rand() % (dim - 1) + 1;  // NOLINT
-      start = rand() % (dim - subDim);      // NOLINT
-      end = start + subDim;
-    }
-  };
-
-  auto subMatrix = [](MatrixPtr& sub,
-                      MatrixPtr matrix,
-                      size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol) {
-    if (!matrix->isTransposed()) {
-      sub = matrix->subMatrix(startRow, endRow, startCol, endCol);
-    } else {
-      sub = matrix->subMatrix(startCol, endCol, startRow, endRow);
-    }
-  };
-
-  int startM, endM;
-  int startN, endN;
-  int startK, endK;
-  subSize(startM, endM, dimM);
-  subSize(startN, endN, dimN);
-  subSize(startK, endK, dimK);
-
-  MatrixPtr subCpuA;
-  MatrixPtr subCpuB;
-  MatrixPtr subGpuA;
-  MatrixPtr subGpuB;
-  subMatrix(subCpuA, cpuA, startM, endM, startK, endK);
-  subMatrix(subGpuA, gpuA, startM, endM, startK, endK);
-  subMatrix(subCpuB, cpuB, startK, endK, startN, endN);
-  subMatrix(subGpuB, gpuB, startK, endK, startN, endN);
-  MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
-  MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
-
-  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
-  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-TEST(Matrix, mul) {
-  for (auto transa : {false, true}) {
-    for (auto transb : {false, true}) {
-      for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) {
-        for (auto dimN : {1, 5, 37, 256, 1024}) {
-          for (auto dimK : {8, 45, 346, 784, 1025}) {
-            if (true == transa && true == transb) {
-              continue;
-            }
-            VLOG(3) << setiosflags(ios::left) << setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << setw(5) << dimM << " dimN=" << setw(5)
-                    << dimN << " dimK=" << setw(5) << dimK;
-
-            testMatrixMul(transa, transb, dimM, dimN, dimK);
-            testSubMatrixMul(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-void testVectorRowFunc(int size) {
-  CpuVectorPtr cpu = std::make_shared<CpuVectorT<real>>(size);
-  GpuVectorPtr gpu = std::make_shared<GpuVectorT<real>>(size);
-
-  cpu->rand();
-  gpu->copyFrom(*cpu);
-
-  EXPECT_EQ(cpu->getMax(), gpu->getMax());
-  EXPECT_EQ(cpu->getMin(), gpu->getMin());
-  EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax());
-}
-
-TEST(Vector, rowFunc) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorRowFunc(size);
-  }
-}
-
-template <class T>
-void testVectorReset(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpu->reset(value);
-  gpu->reset(value);
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVecortSelectFrom(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuDst = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuDst = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuSrc =
-      std::make_shared<CpuVectorT<T>>(size * 2);
-  std::shared_ptr<GpuVectorT<T>> gpuSrc =
-      std::make_shared<GpuVectorT<T>>(size * 2);
-  CpuIVectorPtr cpuIds = std::make_shared<CpuVectorT<int>>(size);
-  GpuIVectorPtr gpuIds = std::make_shared<GpuVectorT<int>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuSrc->rand();
-  } else {
-    cpuSrc->rand(100000);
-  }
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuIds->rand(size);
-  gpuIds->copyFrom(*cpuIds);
-
-  cpuDst->selectFrom(*cpuSrc, *cpuIds);
-  gpuDst->selectFrom(*gpuSrc, *gpuIds);
-
-  TensorCheckEqual(*cpuDst, *gpuDst);
-}
-
-template <class T>
-void testVecotrZeroMem(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  cpu->zeroMem();
-  gpu->zeroMem();
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVectorIsEqual(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuA = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuB = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuA = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuB = std::make_shared<GpuVectorT<T>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuB->rand();
-  } else {
-    cpuB->rand(100000);
-  }
-  gpuB->copyFrom(*cpuB);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpuA->isEqualTo(*cpuB, value);
-  gpuA->isEqualTo(*gpuB, value);
-
-  TensorCheckEqual(*cpuA, *gpuA);
-}
-
-TEST(Vector, Equal) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorReset<int>(size);
-    testVectorReset<real>(size);
-    testVecortSelectFrom<int>(size);
-    testVecortSelectFrom<real>(size);
-    testVecotrZeroMem<int>(size);
-    testVecotrZeroMem<real>(size);
-    testVectorIsEqual<int>(size);
-    testVectorIsEqual<real>(size);
-  }
-}
-
-void testMatrixTopK(int samples, int dim, int beamSize) {
-  MatrixPtr cpuSrc = std::make_shared<CpuMatrix>(samples, dim);
-  MatrixPtr gpuSrc = std::make_shared<GpuMatrix>(samples, dim);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-}
-
-TEST(Matrix, topK) {
-  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 127
-    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
-      for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
-        if (beamSize > dim) continue;
-        VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                << " dim=" << dim;
-        testMatrixTopK(samples, dim, beamSize);
-      }
-    }
-  }
-}
-
-void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
-  int nnz = samples * dim * ratio;
-  if (nnz < 1) nnz = 1;  // Because sparseRand in MathUtil.cpp requires this.
-  MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuVal->zero();
-  cpuIds->zero();
-  gpuVal->zero();
-  gpuIds->zero();
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-
-  IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
-  outCheckIds->copyFrom(*gpuIds);
-
-  const int* data1 = cpuIds->getData();
-  const int* data2 = outCheckIds->getData();
-  size_t size = cpuIds->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (data1[i] == -1 && data1[i] != data2[i]) {
-      EXPECT_EQ(data1[i], data2[i]);
-    }
-  }
-}
-
-TEST(SMatrix, topK) {
-  for (auto samples : {1, 3, 61}) {
-    for (auto dim : {1, 3, 61}) {
-      for (auto beamSize : {1, 3, 61}) {
-        for (auto ratio : {0.01, 0.001}) {
-          if (beamSize > dim) continue;
-          VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                  << " dim=" << dim << " ratio=" << ratio;
-          testSMatrixTopK(samples, dim, beamSize, ratio);
-        }
-      }
-    }
-  }
-}
-
-void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
-  gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-
-  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInGrad->randomizeUniform();
-  gpuInGrad->copyFrom(*cpuInGrad);
-
-  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
-  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuInGrad, *gpuInGrad);
-}
-
-TEST(Matrix, sequenceAvg) {
-  for (auto batchSize : {10, 128, 6000}) {
-    for (auto inputDim : {32, 100, 512}) {
-      for (auto mode : {0, 1, 2}) {
-        VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
-                << " mode=" << mode;
-        testMatrixSequenceAvg(batchSize, inputDim, mode);
-      }
-    }
-  }
-}
-
-void testParamReluBackwardDiff(int height,
-                               int width,
-                               int w_height,
-                               int w_width) {
-  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr diff = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  oGrad->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  diff->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr diffGpu = CpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  oGradGpu->copyFrom(*oGrad);
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-  diffGpu->copyFrom(*diff);
-
-  diff->paramReluBackwardDiff(*oGrad, *input, *w);
-  diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
-
-  TensorCheckErr(*diff, *diffGpu);
-}
-
-TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 40, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          if (width % (w_height * w_width)) continue;
-          testParamReluBackwardDiff(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testClassificationError(int numSamples, int dim, int topkSize) {
-  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
-  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
-  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
-  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
-
-  cpuOutput->randomizeUniform();
-  cpuLabel->rand(dim);
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuLabel->copyFrom(*cpuLabel);
-
-  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
-  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
-
-  TensorCheckEqual(*cpuError, *gpuError);
-}
-
-TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 3, 31}) {
-    for (auto dim : {1, 3, 31}) {
-      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
-        if (topkSize > dim) continue;
-        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
-                << " dim= " << dim;
-        testClassificationError(numSamples, dim, topkSize);
-      }
-    }
-  }
-}
-
-void testMaxPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->maxPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPoolBackward(*input,
-                             imgSizeH,
-                             imgSizeW,
-                             *targetGrad,
-                             *target,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->maxPoolBackward(*inputGpu,
-                                imgSizeH,
-                                imgSizeW,
-                                *targetGpuGrad,
-                                *targetGpu,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->avgPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPoolBackward(*targetGrad,
-                             imgSizeH,
-                             imgSizeW,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->avgPoolBackward(*targetGpuGrad,
-                                imgSizeH,
-                                imgSizeW,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, PoolFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {1, 3}) {
-      for (auto imgSizeH : {13, 17}) {
-        for (auto imgSizeW : {17, 19}) {
-          for (auto sizeX : {2, 3}) {
-            for (auto sizeY : {2, 3}) {
-              for (auto sH : {1, 2}) {
-                for (auto sW : {1, 2}) {
-                  for (auto pH : {0, (sizeY - 1) / 2}) {
-                    for (auto pW : {0, (sizeX - 1) / 2}) {
-                      VLOG(3) << " numSamples=" << numSamples
-                              << " channels=" << channels
-                              << " imgSizeH=" << imgSizeH
-                              << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX
-                              << " sizeY=" << sizeY << " strideH=" << sH
-                              << " strideW=" << sW << " padingH=" << pH
-                              << " padingW=" << pW;
-                      testMaxPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                      testAvgPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void testMaxOutFwdBwd(
-    int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outChannels = channels / groups;
-  int outWidth = imgSizeH * imgSizeW * outChannels;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
-  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  target->maxoutForward(*input, *id, outChannels, groups);
-  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*target, *targetGpu);
-  TensorCheckEqual(*id, *idGpu);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
-  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-TEST(Matrix, MaxOutFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          for (auto groups : {2, 4}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " groups=" << groups;
-            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(CpuMatrix, copyFrom) {
-  const size_t height = 31;
-  const size_t width = 53;
-  CpuMatrix cpu(height, width);
-  GpuMatrix gpu(height, width);
-  CpuMatrix copy(height, width);
-
-  cpu.randomizeUniform();
-  gpu.copyFrom(cpu);
-  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
-
-  TensorCheckEqual(cpu, copy);
-}
-
-void testBatch2seqPadding(int batchSize, int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
-    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
-  }
-
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  size_t numSeq = cpuSequence->getSize() - 1;
-  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
-                                       cpuSequence->getData() + numSeq);
-
-  printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
-  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-
-  // hl_sequence2batch_copy_padding(gBatch->getData(),
-  //                                gpuInput->getData(),
-  //                                cpuSequence->getData(),
-  //                                inputDim,
-  //                                maxSeqLen,
-  //                                numSeq,
-  //                                false,
-  //                                true);
-  // cCheck->copyFrom(*gBatch);
-
-  // int* seqStart = cpuSequence->getData();
-  // float* batchData = cBatch->getData();
-  // float* seqData = cpuInput->getData();
-  // for (size_t i = 0; i < maxSeqLen; i++) {
-  //   for (size_t j = 0; j < numSeq; j++) {
-  //     size_t sequenceStart = seqStart[j];
-  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
-  //     if (i < sequenceLength) {
-  //       memcpy(batchData + (i * numSeq + j) * inputDim,
-  //              seqData + (sequenceStart + i) * inputDim,
-  //              inputDim * sizeof(real));
-  //     } else {
-  //       memset(batchData + (i * numSeq + j) * inputDim,
-  //              0,
-  //              inputDim * sizeof(real));
-  //     }
-  //   }
-  // }
-
-  // TensorCheckErr(*cBatch, *cCheck);
-}
-
-TEST(Matrix, warpCTC) {
-  for (auto batchSize : {1, 3, 17}) {
-    for (auto inputDim : {1, 3, 31}) {
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testBatch2seqPadding(batchSize, inputDim);
-    }
-  }
-}
-
-void testMaxPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPool3DForward(*input,
-                           *maxIdx,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-  targetGpu->maxPool3DForward(*inputGpu,
-                              *maxIdxGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPool3DBackward(*targetGrad,
-                               *maxIdx,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
-                                  *maxIdxGpu,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPool3DForward(*input,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-
-  targetGpu->avgPool3DForward(*inputGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPool3DBackward(*targetGrad,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-
-  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, Pool3DFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {3}) {
-      for (auto imgSizeD : {9, 16}) {
-        for (auto imgSizeH : {9, 32}) {
-          for (auto imgSizeW : {9, 32}) {
-            for (auto sizeX : {3}) {
-              for (auto sizeY : {3}) {
-                for (auto sizeZ : {3}) {
-                  for (auto sD : {2}) {
-                    for (auto sH : {2}) {
-                      for (auto sW : {2}) {
-                        for (auto pD : {0, (sizeZ - 1) / 2}) {
-                          for (auto pH : {0, (sizeY - 1) / 2}) {
-                            for (auto pW : {0, (sizeX - 1) / 2}) {
-                              VLOG(3) << " numSamples=" << numSamples
-                                      << " channels=" << channels
-                                      << " imgSizeD=" << imgSizeD
-                                      << " imgSizeH=" << imgSizeH
-                                      << " imgSizeW=" << imgSizeW
-                                      << " sizeX=" << sizeX
-                                      << " sizeY=" << sizeY
-                                      << " sizeZ=" << sizeZ << " strideD=" << sD
-                                      << " strideH=" << sH << " strideW=" << sW
-                                      << " padingD=" << pD << " padingH=" << pH
-                                      << " padingW=" << pW;
-
-                              testMaxPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                              testAvgPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  //  for (auto numSamples : {1, 3}) {
-  //    for (auto channels : {1, 3}) {
-  //      for (auto imgSizeD : {9,16}) {
-  //      for (auto imgSizeH : {9, 32}) {
-  //        for (auto imgSizeW : {9, 32}) {
-  //          for (auto sizeX : {2, 3}) {
-  //            for (auto sizeY : {2, 3}) {
-  //            for (auto sizeZ : {2,3}){
-  //              for (auto sD : {1, 2}) {
-  //              for (auto sH : {1, 2}) {
-  //                for (auto sW : {1, 2}) {
-  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
-  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
-  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
-  //                      VLOG(3) << " numSamples=" << numSamples
-  //                              << " channels=" << channels
-  //                              << " imgSizeD=" << imgSizeD
-  //                              << " imgSizeH=" << imgSizeH
-  //                              << " imgSizeW=" << imgSizeW
-  //                              << " sizeX=" << sizeX
-  //                              << " sizeY=" << sizeY
-  //                              << " sizeZ=" << sizeZ
-  //                              << " strideD=" << sD
-  //                              << " strideH=" << sH
-  //                              << " strideW=" << sW
-  //                              << " padingD=" << pD
-  //                              << " padingH=" << pH
-  //                              << " padingW=" << pW;
-  //
-  //                      testMaxPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                      testAvgPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                    }
-  //                  }
-  //                }
-  //              }
-  //            }
-  //            }
-  //          }
-  //        }
-  //      }
-  //      }
-  //    }
-  //    }
-  //  }
-  //  }
-}
-
-void testMatrixCol2Vol(int depth, int height, int width) {
-  int channel = 3;
-  int filterX = 3, filterY = 4, filterZ = 5;
-  int strideX = 2, strideY = 2, strideZ = 2;
-  int padX = 1, padY = 1, padZ = 1;
-
-  MatrixPtr cpuImage =
-      std::make_shared<CpuMatrix>(channel, depth * height * width);
-  MatrixPtr gpuImage =
-      std::make_shared<GpuMatrix>(channel, depth * height * width);
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-
-  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
-  int outH = outputSize(height, filterY, padY, strideY, true);
-  int outW = outputSize(width, filterX, padX, strideX, true);
-
-  int colBufHeight = channel * filterZ * filterY * filterX;
-  int colBufWidth = outD * outH * outW;
-  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
-  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
-  cpuColBuf->vol2Col(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  gpuColBuf->vol2Col(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
-
-  cpuColBuf->randomizeUniform();
-  gpuColBuf->copyFrom(*cpuColBuf);
-  cpuColBuf->col2Vol(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  gpuColBuf->col2Vol(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  TensorCheckErr(*cpuImage, *gpuImage);
-}
-
-TEST(Matrix, col2Vol) {
-  for (auto depth : {9, 16, 64}) {
-    for (auto height : {9, 11, 128}) {
-      for (auto width : {9, 32, 128}) {
-        VLOG(3) << "depth=" << depth << " height=" << height
-                << " width=" << width;
-        testMatrixCol2Vol(depth, height, width);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_matrixUtil.h b/paddle/legacy/math/tests/test_matrixUtil.h
deleted file mode 100644
index 58c93f746e7..00000000000
--- a/paddle/legacy/math/tests/test_matrixUtil.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Util.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  for (size_t r = 0; r < a->getHeight(); ++r) {
-    for (size_t c = 0; c < a->getWidth(); ++c) {
-      ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
-    }
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-  ASSERT_EQ(a.getFormat(), b.getFormat());
-  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
-  for (size_t r = 0; r < a.getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
-                       const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  for (size_t r = 0; r < a->getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
-                        const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-
-  if (a.getFormat() == SPARSE_CSC) {
-    int* rows = a.getRows();
-    for (size_t i = 0; i < a.getWidth(); i++) {
-      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a.getCols();
-    for (size_t i = 0; i < a.getHeight(); i++) {
-      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
-                             const CpuMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-
-  if (a->getFormat() == SPARSE_CSC) {
-    int* rows = a->getRows();
-    for (size_t i = 0; i < a->getWidth(); i++) {
-      for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a->getCols();
-    for (size_t i = 0; i < a->getHeight(); i++) {
-      for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  int count = 0;
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            LOG(INFO) << "a=" << aVal << "\t"
-                      << "b=" << bVal;
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (std::abs(a - b) > err) {
-        if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkDataEqual(const real* a, const real* b, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_FLOAT_EQ(a[i], b[i]);
-  }
-}
-
-}  //  namespace paddle
diff --git a/paddle/legacy/math/tests/test_perturbation.cpp b/paddle/legacy/math/tests/test_perturbation.cpp
deleted file mode 100644
index 969400666f1..00000000000
--- a/paddle/legacy/math/tests/test_perturbation.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <cuda_runtime.h>
-#include <gtest/gtest.h>
-#include <cmath>
-#include <vector>
-#include "hl_cuda.h"
-#include "hl_perturbation_util.cuh"
-
-using namespace std;  // NOLINT
-
-#define _USE_MATH_DEFINES
-
-const int NUM_IMAGES = 2;
-const int SAMPLING_RATE = 2;
-const int IMG_SIZE = 41;
-const int TGT_SIZE = 21;
-const int CHANNELS = 3;
-
-class PerturbationTest : public testing::Test {
- protected:
-  virtual void SetUp() { generateTestImages(gpuImages_); }
-
-  virtual void TearDown() {}
-
-  void allocateMem(real*& gpuAngle,
-                   real*& gpuScale,
-                   int*& gpuCenterR,
-                   int*& gpuCenterC) {
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate translation parameters for testing.
-  void generateTranslationParams(int*& gpuCenterR,
-                                 int*& gpuCenterC,
-                                 int imgSize) {
-    int cpuCenterR[NUM_IMAGES * SAMPLING_RATE];
-    int cpuCenterC[NUM_IMAGES * SAMPLING_RATE];
-    for (int i = 0; i < NUM_IMAGES * SAMPLING_RATE; ++i) {
-      cpuCenterR[i] = (imgSize - 1) / 2;
-      cpuCenterC[i] = (imgSize - 1) / 2 - 1;
-    }
-
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate rotation parameters for testing.
-  void generateRotationParams(real*& gpuAngle) {
-    real cpuAngle[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuAngle[i] = 90.0 * M_PI / 180.0;
-    }
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuAngle, cpuAngle, sizeof(real) * NUM_IMAGES);
-  }
-
-  void generateScaleParams(real*& gpuScale) {
-    real cpuScale[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuScale[i] = static_cast<real>(TGT_SIZE - 2) / TGT_SIZE;
-    }
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES);
-  }
-
-  // Generate the test images, only the center regions are set to 1.
-  // The other parts are set to 0.
-  void generateTestImages(real*& gpuImages) {
-    const int IMAGE_MEM_SIZE = NUM_IMAGES * IMG_SIZE * IMG_SIZE * CHANNELS;
-    real cpuImages[IMAGE_MEM_SIZE];
-    // Set the middle of each image to 1.
-    real* ptr = cpuImages;
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      for (int r = 0; r < IMG_SIZE; ++r) {
-        for (int c = 0; c < IMG_SIZE; ++c) {
-          for (int ch = 0; ch < CHANNELS; ++ch) {
-            if (r >= IMG_SIZE / 4 && r < IMG_SIZE - IMG_SIZE / 4 &&
-                c >= IMG_SIZE / 4 && c < IMG_SIZE - IMG_SIZE / 4) {
-              *ptr = 1.0;
-            } else {
-              *ptr = 0.0;
-            }
-            ++ptr;
-          }
-        }
-      }
-    }
-    gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE);
-    hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE);
-  }
-
-  real* gpuImages_;
-};
-
-// Random perturbation. Only to make sure the code does not break.
-TEST_F(PerturbationTest, random_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         true,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-}
-
-TEST_F(PerturbationTest, identity_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         false,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, translation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      if (p < TGT_SIZE * CHANNELS) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-TEST_F(PerturbationTest, rotation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateRotationParams(gpuAngle);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, scale_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateScaleParams(gpuScaleRatio);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      int c = (p / CHANNELS) % TGT_SIZE;
-      int r = (p / CHANNELS) / TGT_SIZE;
-      if (r == 0 || r == TGT_SIZE - 1 || c == 0 || c == TGT_SIZE - 1) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
deleted file mode 100644
index 492aa0a6895..00000000000
--- a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
-//  so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
-
-void testSpMatrixAddBias(int M, int N, real rate, real scale) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_1);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->addBias(*cpuB, scale);
-  gpuA->addBias(*gpuB, scale);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixAddDense(int M, int N, real rate) {  // add3
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->add3(cpuB);
-  gpuA->add3(gpuB);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixMul(int M, int N, int K, real rate) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
-  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
-
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
-  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  gpuC->copyFrom(*cpuC, stream);
-  hl_stream_synchronize(stream);
-
-  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
-  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuC, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixErr(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuC),
-                  std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixCollectBias(int M, int N, real rate) {
-  int nnz = M * N * rate;
-  LOG(INFO) << "nnz=" << nnz;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuB->collectBias(*cpuA, 1);
-  gpuB->collectBias(*gpuA, 1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, N);
-  outputCheck->copyFrom(*gpuB, stream);
-  hl_stream_synchronize(stream);
-  checkMatrixErr(*cpuB, *outputCheck);
-}
-
-TEST(SMatrix, sMatrixOp) {
-  for (auto height : {1, 11, 200}) {
-    for (auto width : {200, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      for (auto rate : {0.02, 0.1}) {
-        testSpMatrixAddDense(height, width, rate);
-        testSpMatrixAddBias(height, width, rate, 1.0);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixMul) {
-  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100, 2000, 20480}) {
-      for (auto K : {100, 512, 1024}) {
-        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
-        testSpMatrixMul(M, N, K, 0.05);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixCollectBias) {
-  for (auto height : {1, 128, 200}) {
-    for (auto width : {100, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testSpMatrixCollectBias(height, width, 0.1);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/optimizer/CMakeLists.txt b/paddle/legacy/optimizer/CMakeLists.txt
deleted file mode 100644
index 7c80faa48ce..00000000000
--- a/paddle/legacy/optimizer/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-set(OPITMIZER_SRCS
-    adadelta_optimizer.cc
-    adagrad_optimizer.cc
-    adam_optimizer.cc
-    optimizer.cc
-    parameter_optimizer.cc
-    sgd_optimizer.cc
-  )
-
-add_library(paddle_optimizer ${OPITMIZER_SRCS})
-target_link_libraries(paddle_optimizer paddle_proto glog)
-
-if (WITH_TESTING)
-    add_unittest(serialization_test serialization_test.cc)
-    add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
-endif()
diff --git a/paddle/legacy/optimizer/adadelta_optimizer.cc b/paddle/legacy/optimizer/adadelta_optimizer.cc
deleted file mode 100644
index 1faeb0cd31e..00000000000
--- a/paddle/legacy/optimizer/adadelta_optimizer.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "adadelta_optimizer.h"
-#include <algorithm>
-#include <cmath>
-
-namespace paddle {
-namespace optimizer {
-
-void AdadeltaOptimizer::Update(const Tensor* gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  Tensor& param = *parameter_;
-  const Tensor& grad = *gradient;
-  Tensor& accum_g = *accum_gradient_;
-  Tensor& accum_d = *accum_delta_;
-  Tensor& update_d = *update_delta_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];
-
-    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
-                  std::sqrt(accum_g[i] + epsilon_) * grad[i];
-
-    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];
-
-    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
-  }
-}
-
-std::string AdadeltaOptimizer::SerializeState() {
-  AdadeltaOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
-  TensorToProto(*accum_delta_, state.mutable_accum_delta());
-  TensorToProto(*update_delta_, state.mutable_update_delta());
-  return state.SerializeAsString();
-}
-
-void AdadeltaOptimizer::DeserializeState(const std::string& str) {
-  AdadeltaOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.accum_gradient(), accum_gradient_);
-  ProtoToTensor(state.accum_delta(), accum_delta_);
-  ProtoToTensor(state.update_delta(), update_delta_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adadelta_optimizer.h b/paddle/legacy/optimizer/adadelta_optimizer.h
deleted file mode 100644
index 5beb62295a8..00000000000
--- a/paddle/legacy/optimizer/adadelta_optimizer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdadeltaOptimizer : public ParameterOptimizer {
- public:
-  AdadeltaOptimizer(
-      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        accum_delta_(new Tensor(parameter->size())),
-        update_delta_(new Tensor(parameter->size())),
-        rho_(rho),
-        epsilon_(epsilon),
-        decay_(decay) {}
-
-  ~AdadeltaOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-    if (accum_delta_) delete accum_delta_;
-    if (update_delta_) delete update_delta_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *accum_gradient_;
-  Tensor *accum_delta_;
-  Tensor *update_delta_;
-  double rho_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adagrad_optimizer.cc b/paddle/legacy/optimizer/adagrad_optimizer.cc
deleted file mode 100644
index 5ac65dbd720..00000000000
--- a/paddle/legacy/optimizer/adagrad_optimizer.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cmath>
-
-#include "adagrad_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-void AdagradOptimizer::Update(const Tensor* gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  Tensor& param = *parameter_;
-  Tensor& accum_g = *accum_gradient_;
-  const Tensor& grad = *gradient;
-  for (size_t i = 0; i < param.size(); ++i) {
-    accum_g[i] += grad[i] * grad[i];
-    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
-                learning_rate * decay_ * param[i];
-  }
-}
-std::string AdagradOptimizer::SerializeState() {
-  AdagradOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
-  return state.SerializeAsString();
-}
-
-void AdagradOptimizer::DeserializeState(const std::string& str) {
-  AdagradOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-
-  num_sample_passed_ = state.num_sample_passed();
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.accum_gradient(), accum_gradient_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adagrad_optimizer.h b/paddle/legacy/optimizer/adagrad_optimizer.h
deleted file mode 100644
index b6fc0673997..00000000000
--- a/paddle/legacy/optimizer/adagrad_optimizer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdagradOptimizer : public ParameterOptimizer {
- public:
-  AdagradOptimizer(Tensor *parameter,
-                   LrPolicy *lr,
-                   double epsilon,
-                   double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdagradOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *accum_gradient_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adam_optimizer.cc b/paddle/legacy/optimizer/adam_optimizer.cc
deleted file mode 100644
index 9a4ff5ecc0f..00000000000
--- a/paddle/legacy/optimizer/adam_optimizer.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "adam_optimizer.h"
-#include <cmath>
-
-namespace paddle {
-namespace optimizer {
-
-void AdamOptimizer::Update(const Tensor *gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
-  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
-  learning_rate *= std::sqrt(coef2) / coef1;
-  Tensor &param = *parameter_;
-  const Tensor &grad = *gradient;
-  Tensor &m = *momentums_;
-  Tensor &v = *velocitys_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
-    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
-    param[i] -=
-        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
-  }
-}
-
-std::string AdamOptimizer::SerializeState() {
-  AdamOptimizerState state;
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-  state.set_num_sample_passed(num_sample_passed_);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*momentums_, state.mutable_momentums());
-  TensorToProto(*velocitys_, state.mutable_velocitys());
-  return state.SerializeAsString();
-}
-
-void AdamOptimizer::DeserializeState(const std::string &str) {
-  AdamOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.momentums(), momentums_);
-  ProtoToTensor(state.velocitys(), velocitys_);
-}
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adam_optimizer.h b/paddle/legacy/optimizer/adam_optimizer.h
deleted file mode 100644
index fce10960068..00000000000
--- a/paddle/legacy/optimizer/adam_optimizer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdamOptimizer : public ParameterOptimizer {
- public:
-  AdamOptimizer(Tensor *parameter,
-                LrPolicy *lr,
-                double beta_1,
-                double beta_2,
-                double epsilon,
-                double decay)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(new Tensor(parameter->size())),
-        velocitys_(new Tensor(parameter->size())),
-        beta_1_(beta_1),
-        beta_2_(beta_2),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdamOptimizer() {
-    if (momentums_) delete momentums_;
-    if (velocitys_) delete velocitys_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *momentums_;
-  Tensor *velocitys_;
-  double beta_1_;
-  double beta_2_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/lr_policy.h b/paddle/legacy/optimizer/lr_policy.h
deleted file mode 100644
index d639c9f22c8..00000000000
--- a/paddle/legacy/optimizer/lr_policy.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <algorithm>
-#include "OptimizerConfig.pb.h"
-
-namespace paddle {
-namespace optimizer {
-
-class LrPolicy {
- public:
-  virtual ~LrPolicy() {}
-  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-};
-
-// constant learning rate policy
-class ConstLr final : public LrPolicy {
- public:
-  ConstLr(double lr) : learning_rate_(lr){};
-  double LearningRate(const uint64_t num_sample_passed) {
-    return learning_rate_;
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-  }
-
- private:
-  double learning_rate_;
-};
-
-class LinearLr final : public LrPolicy {
- public:
-  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
-  double LearningRate(const uint64_t num_sample_passed) {
-    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
-                    lr_decay_b_);
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    state.set_lr_decay_a(lr_decay_a_);
-    state.set_lr_decay_b(lr_decay_b_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-    lr_decay_a_ = state.lr_decay_a();
-    lr_decay_b_ = state.lr_decay_b();
-  }
-
- private:
-  double learning_rate_;
-  double lr_decay_a_;
-  double lr_decay_b_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/optimizer.cc b/paddle/legacy/optimizer/optimizer.cc
deleted file mode 100644
index e583aebd77a..00000000000
--- a/paddle/legacy/optimizer/optimizer.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "optimizer.h"
-#include <glog/logging.h>
-#include <cstdlib>
-#include <cstring>
-#include <string>
-
-#include "parameter_optimizer.h"
-
-using paddle::optimizer::ParameterOptimizer;
-using paddle::optimizer::Tensor;
-
-template <paddle_element_type VALUE>
-struct EnumToType {};
-
-template <class T>
-struct TypeToEnum {};
-
-#define MATCH_ENUM_TYPE(TYPE, ENUM)                 \
-  template <>                                       \
-  struct TypeToEnum<TYPE> {                         \
-    static paddle_element_type v() { return ENUM; } \
-    static constexpr TYPE value = ENUM;             \
-  };                                                \
-  template <>                                       \
-  struct EnumToType<ENUM> {                         \
-    typedef TYPE Type;                              \
-  }
-
-MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
-MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
-MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
-MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
-MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
-MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
-
-struct paddle_optimizer {
-  paddle::optimizer::ParameterOptimizer* impl;
-};
-
-paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
-                                          const int config_proto_len,
-                                          const paddle_element_type data_type,
-                                          void* param_buffer,
-                                          int num_bytes,
-                                          const char* state,
-                                          const int state_len) {
-  paddle_optimizer* optimizer = new paddle_optimizer;
-  std::string config(config_proto, config_proto + config_proto_len);
-  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
-                                 num_bytes / sizeof(float));
-  optimizer->impl = ParameterOptimizer::Create(config, parameter);
-  if (state != nullptr) {
-    std::string s(state, state + state_len);
-    optimizer->impl->DeserializeState(s);
-  }
-  return optimizer;
-}
-
-int paddle_release_optimizer(paddle_optimizer* o) {
-  if (o != nullptr) delete o->impl;
-  return PADDLE_SUCCESS;
-}
-
-int paddle_update_parameter(paddle_optimizer* o,
-                            const paddle_element_type data_type,
-                            const void* grad_buffer,
-                            int num_bytes) {
-  // TOOD(zhihong): datatype not work. need to add the runtime datatype
-  auto grad_type = reinterpret_cast<const float*>(grad_buffer);
-  Tensor* gradient =
-      new Tensor(const_cast<float*>(grad_type), num_bytes / sizeof(float));
-  o->impl->Update(gradient);
-  return PADDLE_SUCCESS;
-}
-
-int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
-  int param_size = 0;
-  *param_buffer = (void*)o->impl->get_weight(&param_size);
-  return param_size;
-}
-
-int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
-  std::string s = o->impl->SerializeState();
-  int state_len = s.size();
-
-  if (state_len > 0) {
-    *state = (char*)std::malloc(state_len);
-    std::memcpy((void*)*state, (const void*)s.c_str(), state_len);
-  }
-
-  return state_len;
-}
diff --git a/paddle/legacy/optimizer/optimizer.h b/paddle/legacy/optimizer/optimizer.h
deleted file mode 100644
index c079de921fa..00000000000
--- a/paddle/legacy/optimizer/optimizer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdbool.h>
-#include <stdint.h>
-
-/**
- * @brief optimizer library in independent with other module
- * which will be used in :
- * Case A, the gradient optimized locally on the trainer.
- *
- * Case B, the gradient optimized on the parameter server.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32 = 0,
-  PADDLE_ELEMENT_TYPE_UINT32 = 1,
-  PADDLE_ELEMENT_TYPE_INT64 = 2,
-  PADDLE_ELEMENT_TYPE_UINT64 = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-/**
- * @brief execution status code
- */
-const int32_t PADDLE_SUCCESS = 0;
-const int32_t PADDLE_ERROR = -1;
-
-typedef struct paddle_optimizer paddle_optimizer;
-/**
- * this group interface called in order :
- * 1. create optimizer with config
- * 2. set weights
- * 3. update_parameter
- * 4. get_weights
- * 5. release optimizer
- */
-
-/**
- *  @brief create optimizer with proto_config
- *  @param config_proto, optimizer protobuf, see OptimizerConfig.proto in detail
- *  @return return optimizer instance
- */
-paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
-                                          const int config_proto_len,
-                                          const paddle_element_type data_type,
-                                          void* param_buffer,
-                                          int num_bytes,
-                                          const char* state,
-                                          const int state_len);
-
-/**
- *  @brief release optimizer
- *  @param optimizer
- *  @return return exec status
- */
-int paddle_release_optimizer(paddle_optimizer* o);
-
-/**
- *  @brief optimizer instance
- *  @param datatype of gradient and parameter
- *  @param gradient, calculate by optimzizer caller.
- *       TODO(zhihong): just pass loss to reduce communicate overhead.
- *                     Project Adam Ms'14 paper for detail
- *  @param num_bytes, gradient size
- *  @return return exec status
- */
-int paddle_update_parameter(paddle_optimizer* o,
-                            const paddle_element_type data_type,
-                            const void* gradient,
-                            int num_bytes);
-
-/**
- *  @brief optimizer for get parameter buffer
- *  @param param_buffer, initilized parameter buffer
- *  @return return content length
- */
-int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer);
-
-/**
- *  @brief optimzizer for saving training state
- *  @param training state for receive SerializeState
- *  @return return state_buffer length
- */
-int paddle_optimizer_get_state(paddle_optimizer* o, const char** state);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/paddle/legacy/optimizer/parameter_optimizer.cc b/paddle/legacy/optimizer/parameter_optimizer.cc
deleted file mode 100644
index f9474b315d5..00000000000
--- a/paddle/legacy/optimizer/parameter_optimizer.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "adadelta_optimizer.h"
-#include "adagrad_optimizer.h"
-#include "adam_optimizer.h"
-#include "lr_policy.h"
-#include "sgd_optimizer.h"
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
-                                               Tensor *parameter) {
-  paddle::OptimizerConfig config;
-  CHECK(config.ParseFromString(config_proto) == true)
-      << "failed parse optimizer config";
-  auto select_lr_policy = [=](const OptimizerConfig &config) -> LrPolicy * {
-    if (config.lr_policy() == OptimizerConfig::Const)
-      return new ConstLr(config.const_lr().learning_rate());
-    if (config.lr_policy() == OptimizerConfig::Linear)
-      return new LinearLr(config.linear_lr().learning_rate(),
-                          config.linear_lr().lr_decay_a(),
-                          config.linear_lr().lr_decay_b());
-    // default
-    LOG(WARNING) << " have not select any LrPolicy. use ConstLr in default";
-    return new ConstLr(0.1);
-  };
-
-  LrPolicy *lr = select_lr_policy(config);
-  auto select_optimizer = [=](
-      Tensor *parameter,
-      const OptimizerConfig &config) -> ParameterOptimizer * {
-    if (config.optimizer() == OptimizerConfig::SGD) {
-      LOG(INFO) << "creating SGD optimizer";
-      return new SGDOptimizer(parameter,
-                              lr,
-                              config.sgd().momentum(),
-                              config.sgd().decay(),
-                              config.sgd().nesterov());
-    }
-    if (config.optimizer() == OptimizerConfig::Adadelta) {
-      LOG(INFO) << "creating Adadelta optimizer";
-      return new AdadeltaOptimizer(parameter,
-                                   lr,
-                                   config.adadelta().rho(),
-                                   config.adadelta().epsilon(),
-                                   config.adadelta().decay());
-    }
-    if (config.optimizer() == OptimizerConfig::Adagrad) {
-      LOG(INFO) << "creating Adagrad optimizer";
-      return new AdagradOptimizer(
-          parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
-    }
-    if (config.optimizer() == OptimizerConfig::Adam) {
-      LOG(INFO) << "creating Adam optimizer";
-      return new AdamOptimizer(parameter,
-                               lr,
-                               config.adam().beta_1(),
-                               config.adam().beta_2(),
-                               config.adam().epsilon(),
-                               config.adam().decay());
-    }
-    // default
-    LOG(WARNING)
-        << "have not select any Optimizer. use SGDOptimizer in default";
-    return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
-  };
-  return select_optimizer(parameter, config);
-}
-
-float *ParameterOptimizer::get_weight(int *param_size) const {
-  *param_size = (int)parameter_->size();
-  return parameter_->get_buffer();
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/parameter_optimizer.h b/paddle/legacy/optimizer/parameter_optimizer.h
deleted file mode 100644
index d5abca82d55..00000000000
--- a/paddle/legacy/optimizer/parameter_optimizer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <functional>
-#include <string>
-#include "OptimizerConfig.pb.h"
-#include "lr_policy.h"
-#include "serialization.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-class ParameterOptimizer {
- public:
-  /**
-   * @brief  update hook for algorithm need to traverse parameter more than
-   * once.
-   */
-  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
-      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
-  virtual ~ParameterOptimizer() {
-    delete parameter_;
-    delete lr_policy_;
-  }
-
-  static ParameterOptimizer *Create(const std::string &config_proto,
-                                    Tensor *parameter);
-  virtual void Update(const Tensor *gradient) = 0;
-  virtual float *get_weight(int *param_size) const;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-
- protected:
-  Tensor *parameter_;
-  // learning rate policy
-  LrPolicy *lr_policy_;
-  uint64_t num_sample_passed_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/parameter_optimizer_test.cc b/paddle/legacy/optimizer/parameter_optimizer_test.cc
deleted file mode 100644
index 1d9572999e9..00000000000
--- a/paddle/legacy/optimizer/parameter_optimizer_test.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "parameter_optimizer.h"
-#include <cmath>
-#include <map>
-#include <vector>
-#include "gtest/gtest.h"
-#include "lr_policy.h"
-
-paddle::optimizer::Tensor* FillTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = (float)rand() / (float)RAND_MAX;
-  }
-  return param;
-}
-
-paddle::optimizer::Tensor* FixedTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = i;
-  }
-  return param;
-}
-
-class OptimizerTest : public testing::Test {
- public:
-  virtual ~OptimizerTest() {}
-  // init paddle::optimizer::Tensor shape
-  const size_t kSize = 5;
-
-  virtual void SetUp() {
-    CreateSGD();
-    CreateAdam();
-  }
-  virtual void TearDown() {}
-
-  void CreateSGD() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::SGD);
-    config_.mutable_sgd()->set_momentum(0.0);
-    config_.mutable_sgd()->set_decay(0.0);
-    config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void CreateAdam() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::Adam);
-    config_.mutable_adam()->set_beta_1(0.9);
-    config_.mutable_adam()->set_beta_2(0.1);
-    config_.mutable_adam()->set_epsilon(1e-3);
-    config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void TestGetWeight() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
-  void TestUpdate() {
-    paddle::optimizer::Tensor* g = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      opts_[i]->Update(g);
-    }
-  }
-
-  void TestCheckPoint() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      auto state = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      auto state1 = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      EXPECT_EQ(state, state1);
-
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
- private:
-  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
-  paddle::OptimizerConfig config_;
-};
-
-TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
-
-TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
-
-TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
diff --git a/paddle/legacy/optimizer/serialization.h b/paddle/legacy/optimizer/serialization.h
deleted file mode 100644
index 2067a8d8cff..00000000000
--- a/paddle/legacy/optimizer/serialization.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <type_traits>
-#include "OptimizerConfig.pb.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
-  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
-  std::stringstream os;
-  for (size_t i = 0; i < tensor.size(); ++i) {
-    os << tensor[i];
-    proto->add_content(os.str());
-    os.str(std::string());
-  }
-}
-
-static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
-  std::stringstream sin;
-  for (auto i = 0; i < proto.content_size(); ++i) {
-    sin << proto.content(i);
-    sin >> (*tensor)[i];
-    sin.str(std::string());
-    sin.clear();
-  }
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/serialization_test.cc b/paddle/legacy/optimizer/serialization_test.cc
deleted file mode 100644
index 93ee1f492f0..00000000000
--- a/paddle/legacy/optimizer/serialization_test.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "serialization.h"
-#include "gtest/gtest.h"
-
-TEST(TensorToProto, Case1) {
-  paddle::optimizer::Tensor t(3), t1(3);
-  for (size_t i = 0; i < t.size(); ++i) {
-    t[i] = i;
-    t1[i] = 10;
-  }
-
-  paddle::TensorProto proto;
-  paddle::optimizer::TensorToProto(t, &proto);
-  paddle::optimizer::ProtoToTensor(proto, &t1);
-  for (size_t i = 0; i < t1.size(); ++i) {
-    EXPECT_EQ(t1[i], t[i]);
-  }
-}
-
-TEST(TensorToProto, Case2) {
-  paddle::optimizer::Tensor t(1), t1(1);
-  for (size_t i = 0; i < t.size(); ++i) {
-    t[i] = i;
-    t1[i] = 10;
-  }
-
-  paddle::TensorProto proto;
-  paddle::optimizer::TensorToProto(t, &proto);
-  paddle::optimizer::ProtoToTensor(proto, &t1);
-  for (size_t i = 0; i < t1.size(); ++i) {
-    EXPECT_EQ(t1[i], t[i]);
-  }
-}
diff --git a/paddle/legacy/optimizer/sgd_optimizer.cc b/paddle/legacy/optimizer/sgd_optimizer.cc
deleted file mode 100644
index c1e2064de75..00000000000
--- a/paddle/legacy/optimizer/sgd_optimizer.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "sgd_optimizer.h"
-#include "serialization.h"
-
-namespace paddle {
-namespace optimizer {
-
-void SGDOptimizer::Update(const Tensor *gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  float velocity = 0.0;
-  Tensor &param = *parameter_;
-  const Tensor &grad = *gradient;
-  Tensor &m = *momentums_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    if (momentum_ == 0.0) {
-      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
-    } else {
-      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
-             learning_rate * decay_ * param[i];
-      velocity = m[i];
-    }
-    if (nesterov_) {
-      param[i] += momentum_ * velocity - learning_rate * grad[i];
-    } else {
-      param[i] += velocity;
-    }
-  }
-}
-
-std::string SGDOptimizer::SerializeState() {
-  SGDOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-  TensorToProto(*parameter_, state.mutable_parameter());
-  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
-  return state.SerializeAsString();
-}
-
-void SGDOptimizer::DeserializeState(const std::string &str) {
-  SGDOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-  ProtoToTensor(state.parameter(), parameter_);
-  if (momentum_ != 0.0) ProtoToTensor(state.momentums(), momentums_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/sgd_optimizer.h b/paddle/legacy/optimizer/sgd_optimizer.h
deleted file mode 100644
index a8957cde54a..00000000000
--- a/paddle/legacy/optimizer/sgd_optimizer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class SGDOptimizer : public ParameterOptimizer {
- public:
-  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(nullptr),
-        momentum_(m),
-        decay_(d),
-        nesterov_(n) {
-    if (momentum_ != 0.0) {
-      size_t size = parameter->size();
-      momentums_ = new Tensor(size);
-    }
-  }
-  virtual ~SGDOptimizer() {
-    if (momentums_) delete momentums_;
-  }
-  void Update(const Tensor* gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string& state);
-
- private:
-  Tensor* momentums_;
-  double momentum_;
-  double decay_;
-  bool nesterov_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/tensor.h b/paddle/legacy/optimizer/tensor.h
deleted file mode 100644
index 2e58577d4df..00000000000
--- a/paddle/legacy/optimizer/tensor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-/**
- * @brief tensor used by optimizer
- */
-
-#include <string.h>
-#include <memory>
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-namespace optimizer {
-
-template <class T>
-class TensorT {
- public:
-  TensorT(size_t size) : height_(1), width_(size) {
-    // new T[size]() initializes all element to zero value.
-    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
-    data_ = data_ptr_.get();
-  }
-
-  TensorT(T* data, size_t size)
-      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
-
-  TensorT(T* data, size_t h, size_t w)
-      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
-
-  virtual ~TensorT() {}
-
-  T* get_buffer() { return this->data_; }
-
-  T& operator[](const size_t idx) {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  T& operator[](const size_t idx) const {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  // TODO: replace with tensorshape
-  size_t size() const { return this->width_ * this->height_; }
-
- protected:
-  size_t height_;
-  size_t width_;
-  std::shared_ptr<T> data_ptr_;
-  T* data_;
-};
-
-// TODO(zhihong): design problem of dynamic datatype, need to fix it
-typedef TensorT<float> Tensor;
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.cpp b/paddle/legacy/parameter/Argument.cpp
deleted file mode 100644
index 3f1d599e901..00000000000
--- a/paddle/legacy/parameter/Argument.cpp
+++ /dev/null
@@ -1,707 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Argument.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-#include <algorithm>
-
-namespace paddle {
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(src->getHeight(), src->getWidth());
-    }
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    IVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          int32_t startRow,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startRow + copySize, src->getHeight());
-    int height = copySize;
-    int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(height, width);
-    }
-    MatrixPtr submat = src->subMatrix(startRow, copySize);
-    if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
-      // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
-      // First copy it to CPU, and then copy it to the GPU.
-      MatrixPtr tmp = src->clone(height, width, false);
-      tmp->copyFrom(*submat, stream);
-      dest->copyFrom(*tmp, stream);
-    } else {
-      dest->copyFrom(*submat, stream);
-    }
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    int height = copySize;
-    IVector::resizeOrCreate(dest, height, useGpu);
-    dest->copyFrom(src->getData() + startPos, height, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu);
-    dest->copyFrom(*src, startPos, copySize, useGpu, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    size_t height = src->size();
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin(), height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->size());
-    size_t height = copySize;
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin() + startPos, height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
-  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src,
-                                 bool useGpu,
-                                 hl_stream_t stream) {
-  dataId = src.dataId;
-  resizeAndCopy(value, src.value, useGpu, stream);
-  resizeAndCopy(grad, src.grad, useGpu, stream);
-  resizeAndCopy(in, src.in, useGpu, stream);
-  resizeAndCopy(ids, src.ids, useGpu, stream);
-  resizeAndCopy(sequenceStartPositions,
-                src.sequenceStartPositions,
-                false /* useGpu */,
-                stream);
-  if (src.hasSubseq()) {
-    resizeAndCopy(subSequenceStartPositions,
-                  src.subSequenceStartPositions,
-                  false /* useGpu */,
-                  stream);
-  }
-  resizeAndCopy(strs, src.strs, useGpu, stream);
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu) {
-  int32_t size =
-      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return size;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu,
-                                    hl_stream_t stream) {
-  dataId = src.dataId;
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-
-  if (!src.sequenceStartPositions) {
-    // non-sequence input, copy samples directly
-    int32_t startRow = startSeq;
-    resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copySize;
-  } else {
-    // sequence input
-    const int* sequence = src.sequenceStartPositions->getData(false);
-    int32_t startRow = sequence[startSeq];           // sample start from here
-    int32_t endRow = sequence[startSeq + copySize];  // sample end
-    int32_t copyFeatureSize = endRow - startRow;     // num of samples
-    resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(sequenceStartPositions,
-                  src.sequenceStartPositions,
-                  startSeq,
-                  copySize + 1,
-                  false,
-                  stream);
-    // modify new sequenceStartPositions
-    int* destSequences = sequenceStartPositions->getMutableData(false);
-    for (int i = 0; i < copySize + 1; i++) {
-      destSequences[i] -= startRow;
-    }
-    CHECK_EQ(destSequences[0], 0);
-    CHECK_EQ(destSequences[copySize], copyFeatureSize);
-    if (src.hasSubseq()) {
-      // sequence has sub-sequence
-      int* subSequence = src.subSequenceStartPositions->getMutableData(false);
-      int32_t subStartSeq = 0;
-      int32_t subEndSeq = 0;
-      int numSubSequences = src.getNumSubSequences();
-      for (int i = 0; i < numSubSequences + 1; i++) {
-        if (subSequence[i] == startRow) {
-          subStartSeq = i;
-        } else if (subSequence[i] == endRow) {
-          subEndSeq = i;
-          break;
-        }
-      }
-      int32_t copySubSize = subEndSeq - subStartSeq;
-      resizeAndCopy(subSequenceStartPositions,
-                    src.subSequenceStartPositions,
-                    subStartSeq,
-                    copySubSize + 1,
-                    false,
-                    stream);
-      // modify new subSequenceStartPositions
-      int* destSubSequences = subSequenceStartPositions->getMutableData(false);
-      for (int i = 0; i < copySubSize + 1; i++) {
-        destSubSequences[i] -= startRow;
-      }
-      CHECK_EQ(destSubSequences[0], 0);
-      CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize);
-    }
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copyFeatureSize;
-  }
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      const std::vector<int>& selectRows,
-                      const std::vector<int>& seqStartPos,
-                      const std::vector<int>& copySize,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  CHECK(!subSequenceStartPositions)
-      << "undefined behavior for subsequence positions";
-
-  size_t batchSize = 0;
-  for (size_t i = 0; i < copySize.size(); ++i)
-    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
-
-  auto copyArg = [batchSize, stream](MatrixPtr& dst,
-                                     MatrixPtr src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
-  };
-
-  auto copyIds = [batchSize, stream](IVectorPtr& dst,
-                                     const IVectorPtr& src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(desStartRow, size)
-        ->copyFrom(*src->subVec(srcStartRow, size), stream);
-  };
-
-  auto copyStrs = [batchSize](SVectorPtr& dst,
-                              const SVectorPtr& src,
-                              int desStartRow,
-                              int srcStartRow,
-                              int size,
-                              bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin() + srcStartRow,
-              src->begin() + srcStartRow + size,
-              dst->begin() + desStartRow);
-  };
-
-  dataId = args[0].dataId;
-  CHECK_NE(seqStartPos.size(), 0UL);
-  int desStartRow = 0;
-  for (size_t i = 0; i < copySize.size(); ++i) {
-    int startPos = seqStartPos[i];
-    int endPos = seqStartPos[i + 1];
-    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
-    for (int j = startPos; j < endPos; ++j) {
-      const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
-                                   << "the same dataId.";
-      const int srcStartRow = selectRows[j];
-      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
-      if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
-      }
-      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
-      desStartRow += copySize[i];
-    }
-  }
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, seqStartPos.size(), useGpu);
-  sequenceStartPositions->copyFrom(
-      seqStartPos.data(), seqStartPos.size(), useGpu);
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  int32_t batchSize = 0;
-  int64_t numSequences = 0;
-  int64_t numSubSequences = 0;
-  for (auto& arg : args) {
-    batchSize += arg.getBatchSize();
-    numSequences += arg.getNumSequences();
-    numSubSequences += arg.getNumSubSequences();
-  }
-
-  auto copyArg = [batchSize, stream](
-      MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight());
-    tmpMatrix->copyFrom(*src, stream);
-  };
-
-  auto copyIds = [batchSize, stream](
-      IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, src->getSize())->copyFrom(*src, stream);
-  };
-
-  auto copyStrs = [batchSize](
-      SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin(), src->end(), dst->begin() + startRow);
-  };
-
-  auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq,
-                            const ICpuGpuVectorPtr& srcSeq,
-                            int dstNumSequences,
-                            int srcNumSequences,
-                            int& startSequences,
-                            int startRow) {
-    if (srcSeq) {
-      ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false);
-      const int* src = srcSeq->getData(false);
-      int* dest = dstSeq->getMutableData(false);
-      for (int i = 0; i < srcNumSequences + 1; ++i) {
-        dest[i + startSequences] = src[i] + startRow;
-      }
-      startSequences += srcNumSequences;
-    } else {
-      dstSeq.reset();
-    }
-  };
-
-  int startRow = 0;
-  int startSequences = 0;
-  int startSubSequences = 0;
-  dataId = args[0].dataId;
-  for (auto& arg : args) {
-    CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                 << " same dataId";
-    copyArg(in, arg.in, startRow, useGpu);
-    copyArg(value, arg.value, startRow, useGpu);
-    if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu);
-    copyIds(ids, arg.ids, startRow, useGpu);
-    copySequencePos(sequenceStartPositions,
-                    arg.sequenceStartPositions,
-                    numSequences,
-                    arg.getNumSequences(),
-                    startSequences,
-                    startRow);
-    copySequencePos(subSequenceStartPositions,
-                    arg.subSequenceStartPositions,
-                    numSubSequences,
-                    arg.getNumSubSequences(),
-                    startSubSequences,
-                    startRow);
-    copyStrs(strs, arg.strs, startRow, useGpu);
-    startRow += arg.getBatchSize();
-  }
-}
-
-void Argument::splitByDataId(const std::vector<Argument>& argus,
-                             std::vector<std::vector<Argument>>* arguGroups) {
-  arguGroups->clear();
-  int lastDataId = -1;
-  for (const auto& argu : argus) {
-    if (argu.dataId == -1) {
-      // is -1, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = -1;
-    } else if (argu.dataId != lastDataId) {
-      // not -1, also not equal to last Argument, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = argu.dataId;
-    } else {
-      // not -1, and equal to last Argument, do nothing
-    }
-    arguGroups->back().push_back(argu);
-  }
-}
-
-void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
-  const int* starts = sequenceStartPositions->getData(false);
-  const int* subStarts =
-      hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr;
-  size_t numSequences = getNumSequences();
-  seqInfo->reserve(numSequences);
-  int subSeqEnd = 0;
-  for (size_t i = 0; i < numSequences; ++i) {
-    SeqInfo info;
-    info.seqStart = starts[i];
-    info.subLevelLength = starts[i + 1] - starts[i];
-    info.seqId = i;
-    if (hasSubseq()) {
-      info.subSeqStart = subSeqEnd;
-      while (subStarts[subSeqEnd] < starts[i + 1]) {
-        ++subSeqEnd;
-      }
-      info.topLevelLength = subSeqEnd - info.subSeqStart;
-    } else {
-      info.topLevelLength = info.subLevelLength;
-      info.subSeqStart = 0;  // not used
-    }
-    seqInfo->push_back(info);
-  }
-  std::sort(
-      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
-        return a.topLevelLength > b.topLevelLength;
-      });
-}
-
-void Argument::checkSubset() const {
-  if (getNumSequences() > getNumSubSequences()) {
-    LOG(FATAL) << "numSubSequences is less than numSequences ("
-               << getNumSubSequences() << " vs. " << getNumSequences() << ")";
-  }
-  const int* start = sequenceStartPositions->getData(false);
-  const int* subStart = subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  int subSeqId = 0;
-  while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) {
-    if (start[seqId] > subStart[subSeqId]) {
-      ++subSeqId;
-    } else if (start[seqId] == subStart[subSeqId]) {
-      ++subSeqId;
-      ++seqId;
-    } else {
-      LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-    }
-  }
-  if (seqId < getNumSequences()) {
-    LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-  }
-}
-
-void Argument::degradeSequence(const Argument& input) {
-  CHECK_EQ(input.hasSubseq(), 1UL);
-  size_t numSequences = input.getNumSequences();
-  size_t numSubSequences = input.getNumSubSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  const int* subStarts = input.subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) {
-    if (subStarts[subSeqId] == starts[seqId]) {
-      tgtBuf[seqId] = subSeqId;
-      seqId++;
-    }
-  }
-  tgtBuf[numSequences] = numSubSequences;
-}
-
-void Argument::poolSequenceWithStride(const Argument& input,
-                                      size_t stride,
-                                      ICpuGpuVectorPtr* stridePostions,
-                                      bool reversed) {
-  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
-  // then sequenceStartPositions = [0, 2, 3, 4, 7].
-  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
-  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
-
-  CHECK(input.sequenceStartPositions);
-  CHECK_EQ(input.hasSubseq(), 0UL);
-  CHECK_GT(stride, 0UL) << "stride must larger than 0";
-  size_t numSequences = input.getNumSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  // first index of target sequence and stride positions are both 0
-  tgtBuf[0] = 0;
-  std::vector<int> stridePos;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    size_t seqLength = starts[seqId + 1] - starts[seqId];
-    stridePos.emplace_back(starts[seqId]);
-    if (seqLength == 0) {
-      // empty sequence
-      tgtBuf[seqId + 1] = tgtBuf[seqId];
-    } else {
-      int size = ceil((float)seqLength / stride);
-      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
-      for (int i = 0; i < size - 1; ++i) {
-        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
-                           : stridePos.back() + stride;
-        stridePos.emplace_back(cur);
-      }
-    }
-  }
-  stridePos.emplace_back(starts[numSequences]);
-  int size = stridePos.size();
-  CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
-}
-
-void Argument::getValueString(
-    std::unordered_map<std::string, std::string>* out) const {
-  if (value) {
-    std::ostringstream os;
-    value->print(os);
-    out->insert({"value", os.str()});
-  }
-  if (ids) {
-    std::ostringstream os;
-    ids->print(os, ids->getSize());
-    out->insert({"ids", os.str()});
-  }
-  if (sequenceStartPositions) {
-    std::ostringstream os;
-    sequenceStartPositions->getVector(false)->print(
-        os, sequenceStartPositions->getSize());
-    out->insert({"sequence pos", os.str()});
-  }
-  if (subSequenceStartPositions) {
-    std::ostringstream os;
-    subSequenceStartPositions->getVector(false)->print(
-        os, subSequenceStartPositions->getSize());
-    out->insert({"sub-sequence pos", os.str()});
-  }
-}
-
-void Argument::printValueString(std::ostream& stream,
-                                const std::string& prefix) const {
-  std::unordered_map<std::string, std::string> out;
-  getValueString(&out);
-  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
-    auto it = out.find(field);
-    if (it != out.end()) {
-      stream << prefix << field << ":\n" << it->second;
-    }
-  }
-}
-
-void Argument::subArgFrom(const Argument& input,
-                          size_t offset,
-                          size_t height,
-                          size_t width,
-                          bool useGpu,
-                          bool trans,
-                          bool seqFlag,
-                          size_t seqStart,
-                          size_t seqSize) {
-  if (input.value) {
-    value = Matrix::create(
-        input.value->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (input.ids) {
-    ids = IVector::create(input.ids->getData() + offset, height, useGpu);
-  }
-  if (input.grad) {
-    grad = Matrix::create(
-        input.grad->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (seqFlag) {
-    sequenceStartPositions = std::make_shared<ICpuGpuVector>(
-        *(input.sequenceStartPositions), seqStart, seqSize);
-  }
-}
-
-void Argument::reorganizeSeqInfo(
-    const ICpuGpuVectorPtr seqStartPos,
-    const ICpuGpuVectorPtr subSeqStartPos,
-    std::vector<std::vector<int>>& reorganizedSeqInfo) {
-  CHECK(seqStartPos);
-  reorganizedSeqInfo.clear();
-
-  int seqNum = seqStartPos->getSize() - 1;
-  int* seqStarts = seqStartPos->getMutableData(false);
-
-  if (subSeqStartPos) {
-    int* subSeqStarts = subSeqStartPos->getMutableData(false);
-    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
-    int seqIdx = 0;
-    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
-      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
-        seqIdx++;
-        if (seqIdx == seqNum) return;
-        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      }
-    }
-  } else {
-    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
-    memcpy(reorganizedSeqInfo[0].data(),
-           seqStarts,
-           sizeof(int) * seqStartPos->getSize());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.h b/paddle/legacy/parameter/Argument.h
deleted file mode 100644
index ea8634896c1..00000000000
--- a/paddle/legacy/parameter/Argument.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "hl_gpu.h"
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
-
-struct Argument {
-  Argument()
-      : in(nullptr),
-        value(nullptr),
-        ids(nullptr),
-        grad(nullptr),
-        strs(nullptr),
-        frameHeight(0),
-        frameWidth(0),
-        frameDepth(0),
-        sequenceStartPositions(nullptr),
-        subSequenceStartPositions(nullptr),
-        cpuSequenceDims(nullptr),
-        deviceId(-1),
-        allCount(0),
-        valueCount(0),
-        gradCount(0),
-        dataId(0) {}
-  Argument(const Argument& argument) {
-    *this = argument;
-    valueCount = 0;
-    gradCount = 0;
-    dataId = argument.dataId;
-  }
-  ~Argument() {}
-
-  void operator=(const Argument& argument) {
-    in = argument.in;
-    value = argument.value;
-    ids = argument.ids;
-    grad = argument.grad;
-    strs = argument.strs;
-    sequenceStartPositions = argument.sequenceStartPositions;
-    subSequenceStartPositions = argument.subSequenceStartPositions;
-    cpuSequenceDims = argument.cpuSequenceDims;
-    deviceId = argument.deviceId;
-    allCount = argument.allCount;
-    frameHeight = argument.frameHeight;
-    frameWidth = argument.frameWidth;
-    frameDepth = argument.frameDepth;
-    dataId = argument.dataId;
-  }
-
-  MatrixPtr in;  // used if needed
-  MatrixPtr value;
-  IVectorPtr ids;  // a sequence of ids. Can be use for class id for costLayer
-  MatrixPtr grad;  // If empty, gradient is not needed.
-  SVectorPtr strs;
-
-  // A dataBatch includes batchSize frames, one frame maybe not only vector
-  size_t frameHeight;
-  size_t frameWidth;
-  size_t frameDepth;
-
-  // If NULL, each position is treated independently.
-  // Otherwise, its size should be #NumberOfSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr sequenceStartPositions;
-
-  // If NULL, each sequence has no subsequence.
-  // Otherwise, its size should be #NumberOfSubSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr subSequenceStartPositions;
-
-  // dimension of sequence, stored only in CPU
-  IVectorPtr cpuSequenceDims;
-
-  int deviceId;            // the GPU device id which the argument in
-  int allCount;            // the number of output layers using this argument
-  mutable int valueCount;  // waiting this member when layer do forward
-  mutable int gradCount;   // waiting this member when layer do backward
-  mutable LockedCondition valueReadyCond;
-  mutable LockedCondition gradReadyCond;
-
-  int dataId;  // dataProvider id
-
-  /* Increase the reference count of the argument. */
-  void countIncrement() { allCount++; }
-
-  int getAllCount() const { return allCount; }
-
-  void waitValueReady() const {
-    valueReadyCond.wait([this] { return (valueCount != 0); });
-
-    std::lock_guard<std::mutex> guard(*valueReadyCond.mutex());
-    valueCount--;
-  }
-
-  void notifyValueReady() const {
-    valueReadyCond.notify_all([this] { valueCount = allCount; });
-  }
-
-  void waitGradReady() const {
-    gradReadyCond.wait([this] { return (gradCount == allCount); });
-    gradCount = 0;
-  }
-
-  void notifyGradReady() const {
-    gradReadyCond.notify_all([this] { gradCount++; });
-  }
-
-  int64_t getBatchSize() const {
-    if (value) return value->getHeight();
-    if (ids) return ids->getSize();
-    if (grad) return grad->getHeight();
-    if (in) return in->getHeight();
-    if (strs) return strs->size();
-    return 0;
-  }
-  size_t getFrameHeight() const { return frameHeight; }
-  size_t getFrameWidth() const { return frameWidth; }
-  size_t getFrameDepth() const { return frameDepth; }
-  void setFrameHeight(size_t h) { frameHeight = h; }
-  void setFrameWidth(size_t w) { frameWidth = w; }
-  void setFrameDepth(size_t d) { frameDepth = d; }
-
-  int64_t getNumSequences() const {
-    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
-                                  : getBatchSize();
-  }
-
-  int64_t getNumSubSequences() const {
-    return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1
-                                     : getBatchSize();
-  }
-
-  bool hasSeq() const { return sequenceStartPositions != nullptr; }
-  bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
-
-  const int* getCpuStartPositions() const {
-    return hasSubseq() ? subSequenceStartPositions->getData(false)
-                       : sequenceStartPositions->getData(false);
-  }
-
-  static inline real sum(const std::vector<Argument>& arguments) {
-    real cost = 0;
-    for (auto& arg : arguments) {
-      if (arg.value) {
-        SetDevice device(arg.deviceId);
-        cost += arg.value->getSum();
-      }
-    }
-    return cost;
-  }
-
-  /**
-   * @brief (value, ids, grad, sequenceStartPositions) of output are subset of
-   *        input. Note that, output share the same memory of input.
-   *
-   * @param input[in]       input
-   * @param offset[in]      offset in terms of rows
-   * @param height[in]      height of output.value
-   * @param width[in]       width of output.value
-   * @param useGpu[in]
-   * @param trans[in]       whether input.value is transform
-   * @param seqFlag[in]     whether input has sequenceStartPositions
-   * @param seqStart[in]    offset of input.sequenceStartPositions
-   * @param seqSize[in]     lenght of output.sequenceStartPositions
-   */
-  void subArgFrom(const Argument& input,
-                  size_t offset,
-                  size_t height,
-                  size_t width,
-                  bool useGpu,
-                  bool trans = false,
-                  bool seqFlag = false,
-                  size_t seqStart = 0,
-                  size_t seqSize = 0);
-  /*
-   * for sequence input:
-   *   startSeq: the sequence id of start
-   *   copySize: how many sequences need to copy
-   *   return value: how many samples are copied
-   * for non-sequence input:
-   *   startSeq: the sample id of start
-   *   copySize: how many samples need to copy
-   *   return value: how many samples are copied
-   * Note that when specifying the stream explicitly in this case,
-   * synchronize should also be called somewhere after this function
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu,
-                            hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu = FLAGS_use_gpu);
-
-  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
-
-  /*
-    @brief Concatenate several arguments into one and put the result into it.
-    @param args : a vector of argument, each element of which is a frame in a
-    batch of sequences.
-    @param selectRows : select several row of args to concatenate
-    @param seqStartPos : sequence start positions in the final Argument
-    @param hl_stream_t : cuda stream
-    @param passTyoe : type of task, training or testing
-   */
-  void concat(const std::vector<Argument>& args,
-              const std::vector<int>& selectRows,
-              const std::vector<int>& seqStartPos,
-              const std::vector<int>& copySize,
-              bool useGpu,
-              hl_stream_t stream,
-              PassType passType);
-
-  /*
-    Concatenate several args into one and put the result into this.
-   */
-  void concat(const std::vector<Argument>& src,
-              bool useGpu = FLAGS_use_gpu,
-              hl_stream_t stream = HPPL_STREAM_DEFAULT,
-              PassType passType = PASS_TEST);
-
-  /*
-   * split vector<Argument> to several vectors according to dataId
-   */
-  static void splitByDataId(const std::vector<Argument>& argus,
-                            std::vector<std::vector<Argument>>* arguGroups);
-
-  struct SeqInfo {
-    // Equal to sequence length for sequence data
-    // Equal to number of subsequences for subsequence data
-    int topLevelLength;
-
-    int seqStart;
-    int seqId;
-
-    // Equal to topLevelLength for sequence data
-    // Equal to sum of the length of subsequences for subsequence data
-    int subLevelLength;
-
-    // Only used for subsequence data, start position of this sequence
-    // is subSequenceStartPositions, i.e.
-    // subSequenceStartPositions[subSeqStart] == seqStart
-    int subSeqStart;
-  };
-  /*
-    Get SeqInfo for each sequence of this argument
-    Elements in *seqInfo are sorted by topLevelLength in descending order
-  */
-  void getSeqInfo(std::vector<SeqInfo>* segInfo) const;
-
-  /*
-   Check Whether sequenceStartPositions is subset of
-   subSequenceStartPositions.
-   */
-  void checkSubset() const;
-
-  /*
-   sequence has sub-sequence degrades to a sequence.
-   */
-  void degradeSequence(const Argument& input);
-
-  /*
-   After pooling with stride n (n is smaller than sequence length),
-   a long sequence will be shorten.
-   This function is invalid for sequence having sub-sequence.
-   */
-  void poolSequenceWithStride(const Argument& input,
-                              size_t stride,
-                              ICpuGpuVectorPtr* stridePositions,
-                              bool reversed = false);
-  /**
-   * @brief getValueString will return the argument's output in string. There
-   * are several kinds of output. The keys of output dictionary are 'value',
-   * 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param out [out]: the return values.
-   */
-  void getValueString(std::unordered_map<std::string, std::string>* out) const;
-
-  /**
-   * @brief printValueString will print the argument's output in order of
-   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param stream: Output stream
-   * @param prefix: line prefix for printing.
-   */
-  void printValueString(std::ostream& stream,
-                        const std::string& prefix = "") const;
-
-  /**
-   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
-   * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo.
-   *
-   * @param seqStartPos: sequenceStartPositions of an Argument.
-   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
-   * @param the reorganized sequence start position information.
-   *
-   * Examples:
-   * seqStartPos: [0, 4, 15, 20, 28]
-   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
-   * reorganizedSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   */
-  static void reorganizeSeqInfo(
-      const ICpuGpuVectorPtr seqStartPos,
-      const ICpuGpuVectorPtr subSeqStartPos,
-      std::vector<std::vector<int>>& reorganizedSeqInfo);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.cpp b/paddle/legacy/parameter/AverageOptimizer.cpp
deleted file mode 100644
index 82a7fed6c64..00000000000
--- a/paddle/legacy/parameter/AverageOptimizer.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageOptimizer.h"
-
-namespace paddle {
-
-// factory method to create an instance of AverageOptimizer
-ParameterOptimizer* AverageOptimizer::create(
-    const OptimizationConfig& optConfig,
-    ParameterOptimizer* optimizer,
-    bool isParameterSparse,
-    bool useParameterApply) {
-  if (optConfig.average_window() <= 0) {
-    return optimizer;
-  }
-  // disable average for embeded local updater
-  if (!useParameterApply && optConfig.num_batches_per_send_parameter() > 1) {
-    return optimizer;
-  }
-  if (isParameterSparse) {
-    return new AverageSparseOptimizer(optConfig, optimizer, useParameterApply);
-  }
-  return new AverageOptimizer(optConfig, optimizer, useParameterApply);
-}
-
-AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig,
-                                   ParameterOptimizer* optimizer,
-                                   bool useParameterApply)
-    : ParameterOptimizer(optConfig),
-      optimizer_(optimizer),
-      useApply_(useParameterApply),
-      numUpdates_(0),
-      prevNumUpdates_(0),
-      numAccumulates_(0),
-      oldNumAccumulates_(0),
-      minAverageWindow_(
-          std::min<int64_t>(10000L, optConfig_.max_average_window())),
-      maxAverageWindow_(optConfig_.max_average_window()) {
-  parameterTypes_ = optimizer_->getParameterTypes();
-  addParameterType(PARAMETER_SUM1);
-  addParameterType(PARAMETER_SUM2);
-  addParameterType(PARAMETER_SUM3);
-  if (useParameterApply) {
-    addParameterType(PARAMETER_APPLY);
-  }
-}
-
-void AverageOptimizer::startBatch(int64_t numSamplesProcessed) {
-  optimizer_->startBatch(numSamplesProcessed);
-  learningRate_ = optimizer_->getLearningRate();
-
-  ++numUpdates_;
-  ++numAccumulates_;
-}
-
-/*
-  After traversal, the averaged parameter can be obtained by
-  ((PARAMETER_SUM1 + PARAMETER_SUM2 + PARAMETER_SUM3)
-  / (numAccumulates_ + oldNumAccumulates_))
-*/
-ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->needSpecialTraversal(config)) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    callbacks.emplace_back([](const VectorPtr vecs[],
-                              const ParameterConfig& config,
-                              size_t sparseId) {
-      vecs[PARAMETER_SUM2]->add(*vecs[PARAMETER_SUM1]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-    });
-  }
-
-  if (isAverageWindowTooLong()) {
-    // Now the average window is too long, discard the old sum.
-    if (auto callback = this->startCatchUpWith()) {
-      callbacks.emplace_back(callback);
-    }
-    callbacks.emplace_back([](const VectorPtr vecs[],
-                              const ParameterConfig& config,
-                              size_t sparseId) {
-      vecs[PARAMETER_SUM3]->add(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-      vecs[PARAMETER_SUM2]->zeroMem();
-    });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageOptimizer::finishBatch() {
-  optimizer_->finishBatch();
-  if (isAverageWindowTooLong()) {
-    this->finishCatchUpWith();
-    oldNumAccumulates_ = numAccumulates_;
-    numAccumulates_ = 0;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::apply() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-
-  real scale = 1. / (numAccumulates_ + oldNumAccumulates_);
-  if (useApply_) {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  } else {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]);
-      vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::restore() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-  if (useApply_) {
-    return nullptr;
-  }
-
-  return [](
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) {
-    vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]);
-    vecs[PARAMETER_GRADIENT]->zeroMem();
-  };
-}
-
-void AverageSparseOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& paraConfig,
-                                    size_t sparseId) const {
-  optimizer_->update(vecs, paraConfig, sparseId);
-
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ + 1 - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith()
-    const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (timer_ > 0) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageSparseOptimizer::catchUpWith(const VectorPtr vecs[],
-                                         const ParameterConfig& paraConfig,
-                                         size_t sparseId) const {
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.h b/paddle/legacy/parameter/AverageOptimizer.h
deleted file mode 100644
index f0fe2fd28e4..00000000000
--- a/paddle/legacy/parameter/AverageOptimizer.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// After Optimization, parameter values are further averaged within
-// time range.
-class AverageOptimizer : public ParameterOptimizer {
- public:
-  // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
-  // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
-  AverageOptimizer(const OptimizationConfig& optConfig,
-                   ParameterOptimizer* optimizer,
-                   bool useParameterApply);
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    ParameterOptimizer* optimizer,
-                                    bool isParameterSparse = false,
-                                    bool useParameterApply = false);
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() {
-    optimizer_->finishPass();
-    updateAverageWindowLimit();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void finishBatch();
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, paraConfig, sparseId);
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
-  virtual TraverseCallback startCatchUpWith() const {
-    return optimizer_->startCatchUpWith();
-  }
-  virtual void finishCatchUpWith() { return optimizer_->finishCatchUpWith(); }
-
-  virtual TraverseCallback apply();
-  virtual TraverseCallback restore();
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  bool useApply_;
-
-  // should only be called from finishPass()
-  void updateAverageWindowLimit() {
-    if (!optConfig_.has_max_average_window()) {
-      // use the number of batches in the last pass as maxAverageWindow_
-      CHECK_GT(numUpdates_, prevNumUpdates_);
-      maxAverageWindow_ = numUpdates_ - prevNumUpdates_;
-      prevNumUpdates_ = numUpdates_;
-    }
-    minAverageWindow_ = std::min(minAverageWindow_, numUpdates_);
-  }
-
-  bool isAverageWindowTooLong() const {
-    return numAccumulates_ >= minAverageWindow_ &&
-           numAccumulates_ >=
-               std::min<int64_t>(maxAverageWindow_,
-                                 numUpdates_ * optConfig_.average_window());
-  }
-
-  static const int64_t kMaxNumAccumulates = 16384;
-  int64_t numUpdates_;
-  int64_t prevNumUpdates_;
-  int64_t numAccumulates_;
-  int64_t oldNumAccumulates_;
-  int64_t minAverageWindow_;
-  int64_t maxAverageWindow_;
-};
-
-// Average Optimizer with Sparse support.
-class AverageSparseOptimizer : public AverageOptimizer {
- public:
-  AverageSparseOptimizer(const OptimizationConfig& optConfig,
-                         ParameterOptimizer* optimizer,
-                         bool useParameterApply)
-      : AverageOptimizer(optConfig, optimizer, useParameterApply) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    AverageOptimizer::init(numRows, config);
-
-    t0Vec_.resize(numRows);
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-  virtual void finishBatch() {
-    AverageOptimizer::finishBatch();
-    timer_++;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& paraConfig,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    optimizer_->finishCatchUpWith();
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
- protected:
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/CMakeLists.txt b/paddle/legacy/parameter/CMakeLists.txt
deleted file mode 100644
index 19ae07e077e..00000000000
--- a/paddle/legacy/parameter/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# The utilities for paddle
-
-file(GLOB PARAMETERS_HEADERS . *.h)
-file(GLOB PARAMETERS_SOURCES . *.cpp)
-
-add_library(paddle_parameter STATIC
-        ${PARAMETERS_SOURCES})
-add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.cpp b/paddle/legacy/parameter/FirstOrderOptimizer.cpp
deleted file mode 100644
index 4f82a115f7b..00000000000
--- a/paddle/legacy/parameter/FirstOrderOptimizer.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FirstOrderOptimizer.h"
-#include "paddle/legacy/math/TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <cmath>
-
-DEFINE_bool(log_clipping, false, "enable log clipping or not");
-
-namespace paddle {
-
-SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer(
-    const OptimizationConfig& optConfig)
-    : ParameterOptimizer(optConfig) {
-  addParameterType(PARAMETER_MOMENTUM);
-  addParameterType(PARAMETER_MOMENTUM_UT);
-  addParameterType(PARAMETER_MOMENTUM_VT);
-  alpha_ = 1;
-  beta_ = 1;
-  tau_ = -1;
-  threshold_ = 1e+06;
-}
-
-void SparseMomentumParameterOptimizer::init(size_t numRows,
-                                            const ParameterConfig* config) {
-  isParameterSparse_ = numRows != 0;
-  t0Vec_.resize(numRows);
-  t0Vec_.assign(t0Vec_.size(), 0);
-  timer_ = 0;
-  momentum_ = config->momentum();
-  decayRate_ = config->decay_rate();
-  gamma_ = config->learning_rate();
-}
-
-void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) {
-  learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  if (isParameterSparse_) {
-    tau_ = tau_ + beta_ / alpha_;
-    alpha_ = alpha_ / momentum_;
-    beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_);
-  }
-}
-
-void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& paraConfig,
-                                              size_t sparseId) const {
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    if (t0Vec_[sparseId] == 0) {
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-      t0Vec_[sparseId] = 1;
-    }
-    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                     -alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                     tau_ * alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                               tau_ / beta_ + 1.0 / alpha_,
-                               *vecs[PARAMETER_MOMENTUM_VT],
-                               1.0 / beta_);
-
-  } else {
-    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                     *vecs[PARAMETER_MOMENTUM],
-                                     learningRate_ * paraConfig.learning_rate(),
-                                     paraConfig.momentum(),
-                                     applyDecay_ ? paraConfig.decay_rate() : 0);
-  }
-}
-
-ParameterOptimizer::TraverseCallback
-SparseMomentumParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (alpha_ > threshold_ && isParameterSparse_) {
-    //  Restart to avoid large value multiplication
-    //  1. \alpha = 1, \beta = 1, \tau = 0
-    //  2. Note that \tau * u_t + v_t = \beta \theta_t, therefore:
-    //     u_t should be rescaled to u_t/alpha_
-    //     v_t should be reset to \theta_t
-    return [this](const VectorPtr vecs[],
-                  const ParameterConfig& config,
-                  size_t sparseId) {
-      vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_);
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void SparseMomentumParameterOptimizer::finishBatch() {
-  timer_++;
-  if (!isParameterSparse_) return;
-  if (alpha_ > threshold_) {
-    alpha_ = 1;
-    beta_ = 1;
-    tau_ = -1;
-  }
-}
-
-void AdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adagradApply(value,
-               grad,
-               mom,
-               accum_buffer,
-               accum,
-               lr,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate);
-}
-
-ParameterOptimizer::TraverseCallback
-AdagradParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    return [](const VectorPtr vecs[],
-              const ParameterConfig& config,
-              size_t sparseId) {
-      vecs[PARAMETER_GRADIENT_SQURESUM]->add(
-          *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-      vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem();
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
-                                        const ParameterConfig& config,
-                                        size_t sparseId) const {
-  CHECK(sparseId == -1LU) << "Sparse update is not supported";
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adadeltaApply(value,
-                grad,
-                mom,
-                accum,
-                accum_update,
-                lr,
-                rou_,
-                epsilon_,
-                learningRate,
-                momentum,
-                decayRate);
-}
-
-void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  rmspropApply(value,
-               grad,
-               mom,
-               sum,
-               sum1,
-               lr,
-               accumulatedRou,
-               rou_,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate,
-               firstTime);
-}
-
-void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& config,
-                                              size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  decayedAdagradApply(value,
-                      grad,
-                      mom,
-                      sum,
-                      lr,
-                      accumulatedRou,
-                      rou_,
-                      epsilon,
-                      learningRate,
-                      momentum,
-                      decayRate,
-                      firstTime);
-}
-
-void AdamParameterOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& config,
-                                    size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-
-  real beta1_power = std::pow(beta1_, step_);
-  real beta2_power = std::pow(beta2_, step_);
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
-
-  adamApply(value,
-            grad,
-            mom,
-            v,
-            beta1_,
-            beta2_,
-            beta1_power,
-            beta2_power,
-            epsilon_,
-            learningRate);
-}
-
-void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
-                                      const ParameterConfig& config,
-                                      size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
-}
-
-void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
-                                           const ParameterConfig& config,
-                                           size_t sparseId) const {
-  real globalThreshold = optConfig_.gradient_clipping_threshold();
-  real localThreshold = config.gradient_clipping_threshold();
-
-  // Use local gradient clipping threshold if it's enabled,
-  // otherwise using the global one.
-  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
-  std::string field = localThreshold > 0.0f ? "local" : "global";
-
-  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > threshold) {
-    if (FLAGS_log_clipping) {
-      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
-                        vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
-                << field << " threshold=" << threshold
-                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
-    }
-    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
-  }
-  optimizer_->update(vecs, config, sparseId);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h
deleted file mode 100644
index 86b9a591aff..00000000000
--- a/paddle/legacy/parameter/FirstOrderOptimizer.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterOptimizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "Regularizer.h"
-
-namespace paddle {
-
-// Plain SGD optimization.
-class SgdOptimizer : public ParameterOptimizer {
- public:
-  explicit SgdOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    (void)sparseId;
-    real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
-                                  ? 1.0 - paraConfig.momentum()
-                                  : 1.0;
-#ifdef PADDLE_WITH_MKLDNN
-    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
-                  (firstTime_ ? 1.0 : torch_learningRate),
-              paraConfig.momentum(),
-              applyDecay_ ? paraConfig.decay_rate() : 0,
-              vecs[PARAMETER_VALUE].get(),
-              vecs[PARAMETER_GRADIENT].get(),
-              vecs[PARAMETER_MOMENTUM].get());
-#else
-    vecs[PARAMETER_VALUE]->sgdUpdate(
-        *vecs[PARAMETER_GRADIENT],
-        *vecs[PARAMETER_MOMENTUM],
-        learningRate_ * paraConfig.learning_rate() *
-            (firstTime_ ? 1.0 : torch_learningRate),
-        paraConfig.momentum(),
-        applyDecay_ ? paraConfig.decay_rate() : 0);
-#endif
-  }
-  virtual void finishBatch() { firstTime_ = false; }
-};
-
-// SGD optimization with sparse support.
-class SparseMomentumParameterOptimizer : public ParameterOptimizer {
-  /* sparse momentum optimizer
-
-    update scheme:
-
-    \alpha_t = \alpha_{t-1} / k
-    \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-    u_t = u_{t-1} - \alpha_t \gamma_t g_t
-    v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-    \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-
-    where:
-    k: momentum
-    lambda: decay rate
-    \gamma_t: learning rate at the t'th step
-  */
-
- public:
-  explicit SparseMomentumParameterOptimizer(
-      const OptimizationConfig& optConfig);
-  virtual void init(size_t numRows, const ParameterConfig* config);
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  virtual void finishBatch();
-
- private:
-  real alpha_;
-  real beta_;
-  real tau_;
-  real gamma_;
-  real threshold_;
-  real momentum_;
-  real decayRate_;
-
- protected:
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-  bool isParameterSparse_;
-};
-
-/*
- * AdaGrad optimization.
- * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
- */
-class AdagradParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    numUpdates_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-    ++numUpdates_;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
- protected:
-  int64_t numUpdates_;
-  static const int64_t kMaxNumAccumulates = 16384;
-};
-
-/*
- * AdaDelta Optimization.
- * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
- */
-class AdaDeltaParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-};
-
-// RMSProp Parameter Optimization.
-class RMSPropParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-// Decayed AdaGrad Optimization.
-class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-/**
- * Adam Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
- */
-class AdamParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        epsilon_(optConfig.adam_epsilon()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_SECOND_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real beta1_;
-  real beta2_;
-  real epsilon_;
-  int64_t step_;
-  real learningRate_;
-};
-
-/**
- * AdaMax Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
- */
-class AdamaxParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_WEIGHTED_INFINITY_NORM);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real beta1_;
-  real beta2_;
-  int64_t step_;
-  real learningRate_;
-};
-
-// Used in pserver,
-// when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
-class AddOptimizer : public ParameterOptimizer {
- public:
-  explicit AddOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    // learningRate required by regularizer
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT],
-                               optConfig_.delta_add_rate());
-  }
-};
-
-// A optimizer which does nothing.
-class DummyOptimizer : public ParameterOptimizer {
- public:
-  explicit DummyOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {}
-};
-
-// Do gradient clipping before sgd update
-class OptimizerWithGradientClipping : public ParameterOptimizer {
- public:
-  OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
-                                ParameterOptimizer* optimizer)
-      : ParameterOptimizer(optConfig), optimizer_(optimizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-    learningRate_ = optimizer_->getLearningRate();
-  }
-  virtual void finishBatch() { optimizer_->finishBatch(); }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp
deleted file mode 100644
index 68c44a7ec49..00000000000
--- a/paddle/legacy/parameter/LearningRateScheduler.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LearningRateScheduler.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-ClassRegistrar<LearningRateScheduler, OptimizationConfig>
-    LearningRateScheduler::registrar_;
-
-LearningRateScheduler* LearningRateScheduler::create(
-    const OptimizationConfig& config) {
-  return registrar_.createByType(config.learning_rate_schedule(), config);
-}
-
-// LRS stands for LearningRateScheduler
-
-class BaseLRS : public LearningRateScheduler {
- public:
-  explicit BaseLRS(const OptimizationConfig& config)
-      : learningRate_(config.learning_rate()),
-        a_(config.learning_rate_decay_a()),
-        b_(config.learning_rate_decay_b()) {}
-
- protected:
-  real learningRate_;
-  real a_;
-  real b_;
-};
-
-class ConstLRS : public BaseLRS {
- public:
-  explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_;
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
-
-class PolyLRS : public BaseLRS {
- public:
-  explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
-
-class CaffePolyLRS : public BaseLRS {
- public:
-  explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    if (numSamplesProcessed > a_) {
-      LOG_FIRST_N(WARNING, 1)
-          << "Using caffe_poly learning rate schedule, "
-          << "learning rate hits ZERO when "
-          << "numSamplesProcessed > config.learning_rate_decay_b(), "
-          << "training is over and you can stop it. "
-          << "See common/LearningRateScheduler.cpp for more info.";
-      return 0;
-    } else {
-      return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_);
-    }
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
-
-class ExpLRS : public BaseLRS {
- public:
-  explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    double decayRatio = (double)numSamplesProcessed / b_;
-    return learningRate_ * pow(a_, decayRatio);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
-
-class DiscreteExpLRS : public BaseLRS {
- public:
-  explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    int numDecays = floor(numSamplesProcessed / b_);
-    return learningRate_ * pow(a_, numDecays);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
-
-class LinearLRS : public BaseLRS {
- public:
-  explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
-
-/*
-  specify learning rate through
-  learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK'
-  if seg_{i-1} <= numSamples <= seg_i,
-  then learning_rate = learning_rate_base * rate_i
-*/
-class ManualLRS : public BaseLRS {
- public:
-  explicit ManualLRS(const OptimizationConfig& config)
-      : BaseLRS(config), currentSegment_(0), lastNum_(0) {
-    std::vector<std::string> pieces;
-    str::split(config.learning_rate_args(), ',', &pieces);
-    rates_.reserve(pieces.size());
-    std::string s1, s2;
-
-    for (auto& piece : pieces) {
-      auto pos = piece.find(':');
-      CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: "
-                                      << config.learning_rate_args();
-      segments_.push_back(str::to<int64_t>(piece.substr(0, pos)));
-      rates_.push_back(str::to<real>(piece.substr(pos + 1)));
-    }
-  }
-
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(numSamplesProcessed);
-  }
-
-  real calc(int64_t num) {
-    // We assume that num never decreases.
-    CHECK_LE(lastNum_, num);
-    lastNum_ = num;
-    while (currentSegment_ < rates_.size()) {
-      if (num <= segments_[currentSegment_]) {
-        return learningRate_ * rates_[currentSegment_];
-      }
-      ++currentSegment_;
-      if (currentSegment_ < rates_.size()) {
-        LOG(INFO) << " learning_rate changes to "
-                  << learningRate_ * rates_[currentSegment_];
-      }
-    }
-    return learningRate_ * rates_.back();
-  }
-
- protected:
-  std::vector<real> rates_;
-  std::vector<int64_t> segments_;
-  size_t currentSegment_;
-  int64_t lastNum_;
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
-
-class PassManualLRS : public ManualLRS {
- public:
-  explicit PassManualLRS(const OptimizationConfig& config)
-      : ManualLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(pass);
-  }
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS);
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h
deleted file mode 100644
index fc7e380a6af..00000000000
--- a/paddle/legacy/parameter/LearningRateScheduler.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-// NOLINTNEXTLINES_4
-#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    LearningRateScheduler::registrar_.registerClass<__class_name>(  \
-        #__type_name);                                              \
-  })
-
-class LearningRateScheduler {
- public:
-  static LearningRateScheduler* create(const OptimizationConfig& config);
-  virtual ~LearningRateScheduler() {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
-
-  static ClassRegistrar<LearningRateScheduler, OptimizationConfig> registrar_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp
deleted file mode 100644
index b7f920b89cc..00000000000
--- a/paddle/legacy/parameter/OptimizerFunctions.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerWithRegularizer.h"
-
-namespace paddle {
-
-// creator for AverageOptimizer
-ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig,
-                                       const ParameterConfig& paraConfig,
-                                       bool isParameterSparse,
-                                       bool inPserver) {
-  ParameterOptimizer* optimizer = OptimizerWithRegularizer::create(
-      optConfig, paraConfig, isParameterSparse, inPserver);
-  return AverageOptimizer::create(
-      optConfig, optimizer, isParameterSparse, inPserver /*useParameterApply*/);
-}
-
-std::vector<ParameterType> sgdOptimizerGetTypes(
-    const OptimizationConfig& optConfig, bool inPserver) {
-  std::unique_ptr<ParameterOptimizer> optimizer;
-  optimizer.reset(
-      AverageOptimizer::create(optConfig,
-                               ParameterOptimizer::create(optConfig, inPserver),
-                               false /*isParameterSparse*/,
-                               inPserver));
-  CHECK(optimizer) << "fail to create optimizer: "
-                   << optConfig.learning_method();
-  return optimizer->getParameterTypes();
-}
-
-bool useApplyInPserver(const OptimizationConfig& optConfig) {
-  auto types = sgdOptimizerGetTypes(optConfig, true /*inPserver*/);
-  return types.end() != std::find(types.begin(), types.end(), PARAMETER_APPLY);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h
deleted file mode 100644
index 57f6fc9d40e..00000000000
--- a/paddle/legacy/parameter/OptimizerFunctions.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-/*
- * Factory function creates the corresponding SgdOptimizer
- * according to the configuration in optConfig.
- */
-ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig,
-                                       const ParameterConfig& paraConfig,
-                                       bool isParameterSparse,
-                                       bool inPserver);
-
-/*
- * Get the parameter types needed for the specific optimization
- * algorithm specified in optConfig.
- */
-std::vector<ParameterType> sgdOptimizerGetTypes(
-    const OptimizationConfig& optConfig, bool inPserver);
-
-/*
- * Whether trainer need call apply() in pserver and get result back.
- * currently, only averager depend on this.
- */
-bool useApplyInPserver(const OptimizationConfig& optConfig);
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp
deleted file mode 100644
index 9e914ae4ece..00000000000
--- a/paddle/legacy/parameter/OptimizerWithRegularizer.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "OptimizerWithRegularizer.h"
-
-namespace paddle {
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerEveryNumBatches::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  TraverseCallbackVec callbacks;
-
-  if (isRegularizationBatch(config)) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->doTraversal(vecs, config); });
-  }
-
-  if (auto callback = optimizer_->needSpecialTraversal(config)) {
-    callbacks.emplace_back(callback);
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerEveryNumBatches::doTraversal(
-    const VectorPtr vecs[], const ParameterConfig& config) const {
-  int32_t base =
-      std::max(baseTimer_, (timer_ + 1 - config.num_batches_regularization()));
-  regularizer_->update(
-      vecs, config, optimizer_->getLearningRate(), base, timer_ + 1);
-}
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (baseTimer_ < timer_) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerEveryNumBatches::catchUpWith(
-    const VectorPtr vecs[],
-    const ParameterConfig& config,
-    size_t sparseId) const {
-  int32_t base = timer_ - timer_ % config.num_batches_regularization();
-  regularizer_->update(vecs,
-                       config,
-                       optimizer_->getLearningRate(),
-                       std::max(base, baseTimer_),
-                       timer_);
-}
-
-void OptimizerWithRegularizerSparse::init(size_t numRows,
-                                          const ParameterConfig* config) {
-  OptimizerWithRegularizer::init(numRows, config);
-  t0Vec_.resize(numRows);
-
-  timer_ = 0;
-  t0Vec_.assign(t0Vec_.size(), 0);
-}
-
-void OptimizerWithRegularizerSparse::update(const VectorPtr vecs[],
-                                            const ParameterConfig& config,
-                                            size_t sparseId) const {
-  optimizer_->update(vecs, config, sparseId);
-  // para W(t0) -> W(t+1)
-  CHECK_LT(sparseId, t0Vec_.size());
-  regularizer_->update(vecs,
-                       config,
-                       optimizer_->getLearningRate(),
-                       t0Vec_[sparseId],
-                       timer_ + 1);
-  t0Vec_[sparseId] = timer_ + 1;
-}
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerSparse::startCatchUpWith() const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (timer_ > 0) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerSparse::catchUpWith(const VectorPtr vecs[],
-                                                 const ParameterConfig& config,
-                                                 size_t sparseId) const {
-  // para W(t0) -> W(t+1)
-  CHECK_LT(sparseId, t0Vec_.size());
-  regularizer_->update(
-      vecs, config, optimizer_->getLearningRate(), t0Vec_[sparseId], timer_);
-}
-
-// factory method to create instance of OptimizerWithRegularizer
-ParameterOptimizer* OptimizerWithRegularizer::create(
-    const OptimizationConfig& optConfig,
-    const ParameterConfig& paraConfig,
-    bool isParameterSparse,
-    bool inPserver) {
-  ParameterOptimizer* optimizer =
-      ParameterOptimizer::create(optConfig, inPserver);
-  if ((optConfig.gradient_clipping_threshold() > 0.0f ||
-       paraConfig.gradient_clipping_threshold() > 0.0f) &&
-      !dynamic_cast<AddOptimizer*>(optimizer)) {
-    optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
-  }
-  Regularizer* regularizer =
-      Regularizer::get(optimizer->getParameterTypes(), paraConfig);
-  if (!regularizer) {
-    return optimizer;
-  }
-
-  if (paraConfig.num_batches_regularization() > 1) {
-    if (optConfig.num_batches_per_send_parameter() > 1) {
-      CHECK_EQ(optConfig.num_batches_per_send_parameter() %
-                   paraConfig.num_batches_regularization(),
-               0)
-          << "regularization should be apply in sending batch";
-    }
-    CHECK(paraConfig.momentum() == 0.0f) << "Parameter cannot support momentum "
-                                            "if num_batches_regularization set";
-
-    if (optConfig.center_parameter_update_method() == "average" &&
-        optConfig.num_batches_per_send_parameter() ==
-            paraConfig.num_batches_regularization()) {
-      LOG(INFO) << "decay in pserver and no decay in trainer";
-      if (inPserver) {  // decay in pserver
-        optimizer->setNoDecay();
-        return new OptimizerWithRegularizer(optConfig, optimizer, regularizer);
-      }
-      // no decay in trainer
-      optimizer->setNoDecay();
-      return optimizer;
-    }
-    if (dynamic_cast<AddOptimizer*>(optimizer)) {
-      return optimizer;  // normal average, no decay in pserver
-    }
-    // normal
-    optimizer->setNoDecay();
-    return new OptimizerWithRegularizerEveryNumBatches(
-        optConfig, optimizer, regularizer);
-  }
-  if (isParameterSparse) {
-    CHECK(paraConfig.momentum() == 0.0f)
-        << "Parameter cannot support momentum if it's sparse.";
-    optimizer->setNoDecay();
-    return new OptimizerWithRegularizerSparse(
-        optConfig, optimizer, regularizer);
-  }
-  // dense
-  if (paraConfig.decay_rate_l1() == 0.0f ||
-      dynamic_cast<AddOptimizer*>(optimizer)) {
-    return optimizer;
-  }
-  CHECK(paraConfig.momentum() == 0.0f)
-      << "Parameter cannot support momentum if it use L1 decay.";
-  optimizer->setNoDecay();
-  return new OptimizerWithRegularizer(optConfig, optimizer, regularizer);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h
deleted file mode 100644
index bd29b396632..00000000000
--- a/paddle/legacy/parameter/OptimizerWithRegularizer.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// add regularizer for objective function to do optimization
-class OptimizerWithRegularizer : public ParameterOptimizer {
- public:
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    const ParameterConfig& paraConfig,
-                                    bool isParameterSparse,
-                                    bool inPserver);
-
-  OptimizerWithRegularizer(const OptimizationConfig& optConfig,
-                           ParameterOptimizer* optimizer,
-                           Regularizer* regularizer)
-      : ParameterOptimizer(optConfig),
-        optimizer_(optimizer),
-        regularizer_(regularizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() {
-    optimizer_->startPass();
-    timer_ = 0;
-  }
-
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-  }
-
-  virtual void finishBatch() {
-    optimizer_->finishBatch();
-    ++timer_;
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-    regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
-  }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  Regularizer* regularizer_;
-
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-};
-
-// Regularized Loss function for every num of batches
-class OptimizerWithRegularizerEveryNumBatches
-    : public OptimizerWithRegularizer {
- public:
-  OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
-                                          ParameterOptimizer* optimizer,
-                                          Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void startPass() {
-    OptimizerWithRegularizer::startPass();
-    baseTimer_ = 0;
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const;
-
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() { baseTimer_ = timer_; }
-
- protected:
-  bool isRegularizationBatch(const ParameterConfig& config) const {
-    return ((timer_ + 1) % config.num_batches_regularization() == 0);
-  }
-
-  /**
-   *  recored the timer_ value while catchUpWith called.
-   */
-  int baseTimer_;
-};
-
-// Regularized Loss function with Sparse support
-class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
- public:
-  OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
-                                 ParameterOptimizer* optimizer,
-                                 Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config);
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
- protected:
-  /**
-   *  t0Vec_ are last occur time of i rows
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp
deleted file mode 100644
index 666d808f0c1..00000000000
--- a/paddle/legacy/parameter/Parameter.cpp
+++ /dev/null
@@ -1,425 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Parameter.h"
-#include <gflags/gflags.h>
-#include <fstream>
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "ThreadLocalBuffer.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_int32(enable_grad_share,
-             (100 * 1024 * 1024),
-             "threshold for enable gradient parameter share for batch "
-             "multi-cpu training");
-DEFINE_int32(
-    grad_share_block_num,
-    64,
-    "block number of gradient parameter share for batch multi-cpu training");
-
-namespace paddle {
-
-const std::string Parameter::kMissParameterFail = "fail";
-const std::string Parameter::kMissParameterRand = "rand";
-const std::string Parameter::kMissParameterZero = "zero";
-
-Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(-1),
-      sharedCount_(0),
-      updateCounter_(0),
-      updated_(false),
-      headerFormat_(PARAM_FORMAT_ORIGINAL) {
-  setID(-1); /* capture uninitialized id */
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  if (doInit) {
-    initialize();
-  }
-
-  for (int i = 0; i < config.update_hooks_size(); ++i) {
-    this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i));
-  }
-}
-
-void Parameter::initialize() {
-  SetDevice device(deviceId_);
-
-  bufs_[PARAMETER_VALUE] =
-      Vector::createParallelVector(config_.size(), useGpu_);
-  bufs_[PARAMETER_VALUE]->zeroMem();
-
-  if (config_.is_sparse()) {
-    enableSparseParameter();
-  }
-
-  if (!isStatic()) {
-    bufs_[PARAMETER_GRADIENT] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[PARAMETER_MOMENTUM] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-
-    bufs_[PARAMETER_GRADIENT]->zeroMem();
-    bufs_[PARAMETER_MOMENTUM]->zeroMem();
-  }
-}
-
-void Parameter::randomize(const VectorPtr& value,
-                          const ParameterConfig& config) {
-  if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) {
-    // initialize the parameter as uniform distribution
-    real initial_min = config.initial_mean() - config.initial_std();
-    real initial_max = config.initial_mean() + config.initial_std();
-    value->uniform(initial_min, initial_max);
-    VLOG(1) << config.name() << ": initial_min=" << initial_min
-            << ", initial_max=" << initial_max;
-  } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) {
-    /* Initialize the parameters randomly */
-    value->randnorm(config.initial_mean(), config.initial_std());
-    VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean()
-            << ", initial_std=" << config.initial_std();
-  } else {
-    LOG(FATAL) << "not supported initial_strategy: "
-               << config.initial_strategy();
-  }
-}
-
-void Parameter::randomize() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  SetDevice device(deviceId_);
-  Parameter::randomize(bufs_[PARAMETER_VALUE], config_);
-
-  if (config_.is_sparse()) {
-    if (format_ == SPARSE_CSC) {
-      sparseRand(intBufs_[PARAMETER_COLS]->getData(),
-                 intBufs_[PARAMETER_ROWS]->getData(),
-                 config_.size(),
-                 config_.dims(1) + 1,
-                 config_.dims(0),
-                 useGpu_);
-    } else {
-      sparseRand(intBufs_[PARAMETER_ROWS]->getData(),
-                 intBufs_[PARAMETER_COLS]->getData(),
-                 config_.size(),
-                 config_.dims(0) + 1,
-                 config_.dims(1),
-                 useGpu_);
-    }
-  }
-  setValueUpdated();
-}
-
-void Parameter::zeroMem() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  bufs_[PARAMETER_VALUE]->zeroMem();
-  setValueUpdated();
-  LOG(INFO) << getName() << " set to 0";
-}
-
-bool Parameter::isGradShared(size_t* blockNum) {
-  if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 &&
-      !isGradSparseUpdate() &&
-      this->getSize() > (size_t)FLAGS_enable_grad_share) {
-    if (blockNum) {
-      *blockNum = (size_t)FLAGS_grad_share_block_num;
-    }
-    return true;
-  }
-  return false;
-}
-
-bool Parameter::isValueShared() {
-  return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1;
-}
-
-bool Parameter::isGradSparseUpdate() const {
-  return !useGpu_ && !isStatic() &&
-         (config_.sparse_update() || config_.sparse_remote_update());
-}
-
-void Parameter::setMat(ParameterType pType, int matType) {
-  CHECK(!mats_[pType]);
-
-  if (config_.dims_size() == 0 && matType == MAT_NORMAL) {
-    return;
-  }
-
-  CHECK_EQ((size_t)config_.dims_size(), 2LU);
-  size_t height = config_.dims(0);
-  size_t width = config_.dims(1);
-  if (matType == MAT_NORMAL) {
-    if (!config_.is_sparse()) {
-      CHECK_EQ(height * width, bufs_[pType]->getSize());
-      mats_[pType] =
-          Matrix::create(bufs_[pType]->getMemoryHandle(), height, width);
-    } else {
-      size_t size = bufs_[pType]->getSize();
-      CHECK_GE(height * width, size);
-      if (format_ == SPARSE_CSR) {
-        CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize());
-      } else {
-        CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize());
-      }
-      mats_[pType] =
-          Matrix::createSparseMatrix(bufs_[pType]->getData(),
-                                     intBufs_[PARAMETER_ROWS]->getData(),
-                                     intBufs_[PARAMETER_COLS]->getData(),
-                                     height,
-                                     width,
-                                     bufs_[pType]->getSize(),
-                                     FLOAT_VALUE,
-                                     format_,
-                                     false,
-                                     useGpu_);
-    }
-  }
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOLINTNEXTLINE
-  else if (matType == MAT_NORMAL_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    size_t blockNum = 0;
-    CHECK(isGradShared(&blockNum));
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        blockNum,
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_VALUE_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW_IDS) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SparseRowIdsCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW) {
-    auto valueMat =
-        std::dynamic_pointer_cast<SparseRowCpuMatrix>(mats_[PARAMETER_VALUE]);
-    SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr);
-    if (pType != PARAMETER_VALUE) {
-      CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set "
-                      << " and its type must be MAT_SPARSE_ROW,"
-                      << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW";
-      indexDict = valueMat->getIndexDictHandle();
-    }
-    auto mat =
-        std::make_shared<SparseRowCpuMatrix>(nullptr,
-                                             height,
-                                             width,
-                                             // grad share index with value
-                                             indexDict);
-    mats_[pType] = mat;
-  } else if (matType == MAT_CACHE_ROW) {
-    CHECK(isGradSparseUpdate());
-    auto mat = std::make_shared<CacheRowCpuMatrix>(height, width);
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-             matType == MAT_SPARSE_ROW_PREFETCH) {
-    auto mat = std::make_shared<SparsePrefetchRowCpuMatrix>(
-        bufs_[pType] ? std::dynamic_pointer_cast<CpuMemoryHandle>(
-                           bufs_[pType]->getMemoryHandle())
-                     : nullptr,
-        height,
-        width,
-        nullptr,  // indexDictHandle
-        getGlobalSyncThreadPool());
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
-    CHECK(isGradSparseUpdate());
-    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  }
-#endif
-  // NOLINTNEXTLINE
-  else {
-    LOG(FATAL) << "Unsupported mat type" << matType;
-  }
-}
-
-void Parameter::incUpdate(const UpdateCallback& callback) {
-  // Static parameter is fixed, and does not need to be updated
-  if (isStatic()) {
-    return;
-  }
-
-  ++updateCounter_;
-  if (isUpdatable()) {
-    if (callback) callback(this);
-    clearUpdate();
-  }
-}
-
-bool Parameter::save(const std::string& filename) const {
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-  return save(fs);
-}
-
-bool Parameter::save(std::ostream& s) const {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  header.format = headerFormat_;
-  header.valueSize = sizeof(real);
-  header.size = getSize();
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(s.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter " << getName();
-
-  CHECK(s.write(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)))
-      << "Fail to write parameter " << getName();
-  if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    CHECK(s.write(reinterpret_cast<char*>(rows.getData()),
-                  rows.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-    CHECK(s.write(reinterpret_cast<char*>(cols.getData()),
-                  cols.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-  }
-
-  return true;
-}
-
-/**
- * Load parameter value from a file
- */
-bool Parameter::load(const std::string& filename) {
-  std::ifstream fs(filename, std::ios_base::binary);
-  if (!fs) {
-    LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
-      LOG(FATAL) << getName() << " missing, not allowed.";
-      return false;
-    }
-    if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to random.";
-      randomize();
-      return true;
-    }
-    if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to zero.";
-      zeroMem();
-      return true;
-    }
-    LOG(FATAL) << "unsupported load_missing_parameter_strategy: "
-               << FLAGS_load_missing_parameter_strategy;
-    return false;
-  }
-  return load(fs);
-}
-
-bool Parameter::load(std::istream& s) {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameter " << getName();
-  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
-                                                << header.format;
-  headerFormat_ = header.format;
-  CHECK_EQ(header.size, getSize())
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << getSize() << ") of the parameter: " << getName();
-  CHECK_EQ(header.valueSize, sizeof(real))
-      << "Unsupported valueSize " << header.valueSize << " at: " << getName();
-  CHECK(s.read(reinterpret_cast<char*>(vec.getData()),
-               header.size * sizeof(real)));
-
-  auto& tmp = *bufs_[PARAMETER_VALUE].get();
-  if (typeid(tmp) == typeid(GpuVector)) {
-    bufs_[PARAMETER_VALUE]->copyFrom(vec);
-  }
-
-  if (config_.is_sparse() && config_.need_compact()) {
-    // load from dense parameter with many zero
-    CHECK_EQ(config_.dims_size(), 2);
-    auto height = config_.dims(0);
-    auto width = config_.dims(1);
-    auto mat = Matrix::create(vec.getData(), height, width);
-    CpuSparseMatrix sparseMat(height,
-                              width,
-                              0,
-                              FLOAT_VALUE,
-                              format_,
-                              /*trans*/ false);
-    sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT);
-    auto nnz = sparseMat.getElementCnt();
-    size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz;
-    size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1;
-
-    intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize);
-    intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize);
-    bufs_[PARAMETER_VALUE]->resize(nnz);  // for setMat check
-    bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz);
-    config_.set_size(nnz);
-    LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width))
-              << " name=" << config_.name();
-  } else if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    size_t rowSize, colSize;
-    CHECK_EQ(config_.dims_size(), 2);
-    if (format_ == SPARSE_CSR) {
-      rowSize = config_.dims(0) + 1;
-      colSize = config_.size();
-    } else {
-      rowSize = config_.size();
-      colSize = config_.dims(1) + 1;
-    }
-    CHECK(
-        s.read(reinterpret_cast<char*>(rows.getData()), rowSize * sizeof(int)));
-    CHECK(
-        s.read(reinterpret_cast<char*>(cols.getData()), colSize * sizeof(int)));
-    auto& paramRows = *intBufs_[PARAMETER_ROWS].get();
-    if (typeid(paramRows) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_ROWS]->copyFrom(rows);
-    }
-    auto& paramCols = *intBufs_[PARAMETER_COLS].get();
-    if (typeid(paramCols) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_COLS]->copyFrom(cols);
-    }
-  }
-
-  setValueUpdated();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.h b/paddle/legacy/parameter/Parameter.h
deleted file mode 100644
index 43b567dad04..00000000000
--- a/paddle/legacy/parameter/Parameter.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "ParameterConfig.pb.h"
-#include "TrainerConfig.pb.h"
-
-#include "ParameterUpdaterHook.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-typedef enum {
-  /// The paddle original basic format
-  PARAM_FORMAT_ORIGINAL = 0,
-
-  /// See mkldnn_memory_format_t in
-  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
-  /// for a detailed description.
-  /// 2D weights tensor in the format (output channels, input channels).
-  PARAM_FORMAT_MKLDNN_OI,
-
-  /// The total format items numbers
-  PARAM_FORMAT_ITEMS,
-} PARAM_FORMAT;
-
-class SparsePrefetchRowCpuMatrix;
-
-class Parameter;
-typedef std::function<void(Parameter* param)> UpdateCallback;
-typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback;
-
-class Parameter;
-typedef std::shared_ptr<Parameter> ParameterPtr;
-
-class Parameter {
- public:
-  Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
-  const std::string& getName() const { return config_.name(); }
-
-  size_t getSize() const { return config_.size(); }
-
-  bool isFullSize() const {
-    if (bufs_[PARAMETER_VALUE]) {
-      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
-    }
-    return false;
-  }
-
-  inline bool useGpu() const { return useGpu_; }
-
-  int getDeviceId() const { return deviceId_; }
-
-  void setDevice(int deviceId) { deviceId_ = deviceId; }
-
-  /// The id ranges from 0 to the_total_number_of_parameters - 1
-  size_t getID() const { return config_.para_id(); }
-
-  /// ID is a implict value created until neural network is built.
-  void setID(size_t id) { config_.set_para_id(id); }
-
-  bool isStatic() const { return config_.is_static(); }
-
-  enum MatType {
-    MAT_NORMAL,
-    /// both value and grad are shared
-    MAT_NORMAL_SHARED,
-
-    /// Now used in BatchNorm in CPU mode
-    MAT_VALUE_SHARED,
-
-    /// sparse matrix, which has full size parameter
-    MAT_SPARSE_ROW_IDS,
-    /// sparse matrix, parameter size scale by sparse rates.
-    MAT_SPARSE_ROW_AUTO_GROW,
-    MAT_CACHE_ROW,
-    MAT_SPARSE_ROW,
-
-    /// sparse matrix for prefetching parameter from pserver
-    MAT_SPARSE_ROW_PREFETCH,
-    /// same as above, but parameter has full size for saving parameter in local
-    MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
-  };
-
-  void enableSparseParameter() {
-    if (config_.is_sparse()) {
-      if (config_.format() == "csr") {
-        size_t height = config_.dims(0);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_ROWS, height + 1);
-        enableIntType(PARAMETER_COLS, nnz);
-        format_ = SPARSE_CSR;
-      } else {
-        size_t width = config_.dims(1);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_COLS, width + 1);
-        enableIntType(PARAMETER_ROWS, nnz);
-        format_ = SPARSE_CSC;
-      }
-    }
-  }
-
-  /// allocate buffer for the give type
-  void enableType(ParameterType type, MatType matType = MAT_NORMAL) {
-    if (bufs_[type] || mats_[type]) {
-      return;
-    }
-    SetDevice device(deviceId_);
-    if (config_.dims_size() == 2) {
-      if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED ||
-          matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-          matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) {
-        bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-        bufs_[type]->zeroMem();
-      } else {
-        CHECK(isGradSparseUpdate());
-      }
-      if (config_.is_sparse() && type == PARAMETER_VALUE) {
-        enableSparseParameter();
-      }
-      setMat(type, matType);
-    } else {
-      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-      bufs_[type]->zeroMem();
-    }
-  }
-
-  void enableBufType(ParameterType type) {
-    if (bufs_[type]) return;
-    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[type]->zeroMem();
-  }
-
-  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
-    if (!intBufs_[type]) {
-      SetDevice device(deviceId_);
-      size_t size = intStoreSize ? intStoreSize : config_.size();
-      intBufs_[type] = IVector::create(size, useGpu_);
-      intBufs_[type]->zeroMem();
-    }
-  }
-
-  void enableSharedType(ParameterType type,
-                        VectorPtr vec,
-                        MatrixPtr mat = nullptr) {
-    if (!bufs_[type] && !mats_[type]) {
-      bufs_[type] = vec;
-      mats_[type] = mat;
-    }
-  }
-
-  /// for batchGradientMachine: blockNum is number of partitions of the matrix.
-  bool isGradShared(size_t* blockNum = NULL);
-
-  bool isValueShared();
-
-  // for AsgdSparseGradientMachine & SgdSparseGradientMachine:
-  // and MultiGradientMachine
-  bool isGradSparseUpdate() const;
-
-  bool isSparseRemoteUpdate() const {
-    return config_.sparse_remote_update() && !useGpu();
-  }
-
-  const ParameterConfig& getConfig() const { return config_; }
-
-  ParameterConfig& getConfig() { return config_; }
-
-  bool hasType(ParameterType pType) const {
-    return bufs_[pType] || mats_[pType];
-  }
-
-  const VectorPtr& getBuf(ParameterType pType) const {
-    return this->bufs_[pType];
-  }
-
-  const VectorPtr* getBufs() const { return bufs_; }
-
-  const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; }
-
-  void setValueUpdated() { updated_ = true; }
-
-  void clearValueUpdated() { updated_ = false; }
-
-  bool isValueUpdated() const { return updated_; }
-
-  /**
-   * Save parameter value to a file
-   */
-  bool save(const std::string& filename) const;
-
-  /**
-   * Save parameter to ostream
-   */
-  bool save(std::ostream& s) const;
-
-  /**
-   * Load parameter value from a file
-   */
-  bool load(const std::string& filename);
-
-  /**
-   * Load parameter from istream
-   */
-  bool load(std::istream& is);
-
-  void incShared() { sharedCount_++; }
-
-  /**
-   * After one of the parameter's gradient is merged
-   * You should call this function to do some additional processing,
-   */
-  void incUpdate(const UpdateCallback& callbacks = NULL);
-
-  void clearGradient() {
-    auto& mat = getMat(PARAMETER_GRADIENT);
-    if (mat) {
-      // zeroMem will also clear rows for SparseRowCpuMatrix
-      mat->zeroMem();
-    } else {
-      auto& gradBuf = getBuf(PARAMETER_GRADIENT);
-      if (gradBuf) gradBuf->zeroMem();
-    }
-  }
-
-  void initialize();
-
-  /**
-   * Initialize the value according to config_: initial_mean,
-   * initial_std and initial_strategy.
-   */
-  void randomize();
-  static void randomize(const VectorPtr& value, const ParameterConfig& config);
-
-  /// Initialize the value to 0
-  void zeroMem();
-
-  /// file header structure
-  struct Header {
-    int32_t format;      // = PARAM_FORMAT
-    uint32_t valueSize;  // = sizeof(real)
-    uint64_t size;       // = getSize()
-  };
-
-  /**
-   * @brief Is the header format supported.
-   */
-  static bool isHeaderFormatSupported(int32_t fmt) {
-    return fmt < PARAM_FORMAT_ITEMS;
-  }
-
-  /**
-   * @brief Get the format in header.
-   */
-  int getHeaderFormat() { return headerFormat_; }
-
-  /**
-   * @brief Set the format in header.
-   */
-  void setHeaderFormat(int32_t fmt) {
-    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
-                                        << fmt;
-    headerFormat_ = fmt;
-  }
-
-  /**
-   * @brief  Parameter Update Hook.
-   *
-   * The parameter's update hook before ParameterUpdater::updateImpl
-   * It could modify gradient/momentum/etc here. Such as drop some gradient,
-   * etc.
-   */
-  void updateHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->update(this);
-    }
-  }
-
-  /**
-   * @brief  Initialize all updater hook.
-   *
-   * This method should be invoked in ParameterUpdater::init() only.
-   */
-  void initHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->init(this);
-    }
-  }
-
- protected:
-  /**
-   * @brief create matrix to matType.
-   *
-   * used by gradient machine which needs specify matrix type,
-   * instead of creating in weights.cpp.
-   *
-   * @note  pType should be enabled already.
-   */
-  void setMat(ParameterType pType, int matType);
-
-  bool isUpdatable() { return (updateCounter_ == sharedCount_); }
-
-  void clearUpdate() { updateCounter_ = 0; }
-
- protected:
-  ParameterConfig config_;
-
-  bool useGpu_;
-
-  int deviceId_;
-
-  /**
-   * @brief bufs_ stores parameter value and gradient.
-   *
-   * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for
-   * calculation and stores gradient to bufs_[PARAMETER_GRADIENT].
-   */
-  VectorPtr bufs_[NUM_PARAMETER_TYPES];
-
-  /**
-   * @brief Weight matrix for bufs_.
-   *
-   * It's helpfull when parameter shared by multi-layers.
-   * Caller should check, if mats exist, do not create it again.
-   */
-  MatrixPtr mats_[NUM_PARAMETER_TYPES];
-
-  /// Int vectors, used in some User defined parameter types
-  IVectorPtr intBufs_[NUM_PARAMETER_TYPES];
-
-  int sharedCount_;
-  int updateCounter_;
-
-  bool updated_;
-  SparseFormat format_;
-
-  /// The header format for saving or loading param
-  int32_t headerFormat_;
-
-  std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
-
- public:
-  void setSharedCount(int cnt) { sharedCount_ = cnt; }
-  int getSharedCount() { return sharedCount_; }
-
-  bool isSparse() { return config_.is_sparse(); }
-  SparseFormat getFormat() { return format_; }
-
-  static const std::string kMissParameterFail;
-  static const std::string kMissParameterRand;
-  static const std::string kMissParameterZero;
-};
-
-typedef std::map<std::string, ParameterPtr> ParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.cpp b/paddle/legacy/parameter/ParameterOptimizer.cpp
deleted file mode 100644
index b9dffa5afb4..00000000000
--- a/paddle/legacy/parameter/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include <fstream>
-
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterOptimizer.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-ParameterOptimizer* ParameterOptimizer::create(
-    const OptimizationConfig& optConfig, bool inPserver) {
-  if (inPserver && optConfig.num_batches_per_send_parameter() > 1) {
-    return new AddOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "torch_momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adagrad") {
-    return new AdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adadelta") {
-    return new AdaDeltaParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "rmsprop") {
-    return new RMSPropParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "decayed_adagrad") {
-    return new DecayedAdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adam") {
-    return new AdamParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adamax") {
-    return new AdamaxParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "sparse_momentum") {
-    return new SparseMomentumParameterOptimizer(optConfig);
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.h b/paddle/legacy/parameter/ParameterOptimizer.h
deleted file mode 100644
index 019afa1358a..00000000000
--- a/paddle/legacy/parameter/ParameterOptimizer.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LearningRateScheduler.h"
-#include "Parameter.h"
-
-namespace paddle {
-
-/**
- * Some member functions are set to const for two reasons:
- *
- * 1. For sparse update thread safe: update(), traverse callback(const this)
- *    may be called many times, each time one row, and these function
- *    can be called parallelly by multi worker, to speed up large block.
- *
- * 2. For predicate functions, needSpecialTraversal(), startCatchUpWith()
- *    may be called many times, should be no state change between calls.
- */
-class ParameterOptimizer {
- public:
-  typedef std::function<void(
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
-      TraverseCallback;
-
- public:
-  explicit ParameterOptimizer(const OptimizationConfig& optConfig)
-      : applyDecay_(true),
-        optConfig_(optConfig),
-        parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT},
-        learningRate_(optConfig.learning_rate()),
-        learningRateScheduler_(LearningRateScheduler::create(optConfig)),
-        pass_(0),
-        firstTime_(true) {}
-
-  real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRateScheduler_->calcLearningRate(numSamplesProcessed, pass);
-  }
-
-  virtual ~ParameterOptimizer() {}
-
-  /**
-   * For sparse update, optimizer can maintain numRows of timer(t0).
-   * Some sparse optimizer depends on parameter config in functions
-   * such as startBatch(). Optimizer can get it here. But notice that,
-   * not all callers can pass config here, so the optimizer should check
-   * config passed in is not null ptr.
-   */
-  virtual void init(size_t numRows, const ParameterConfig* config) {}
-
-  virtual void startPass() {}
-  virtual void finishPass() { ++pass_; }
-
-  /// called by Trainer before forward() of a batch.
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-  }
-
-  /**
-   * following hooks useful for sparse update,
-   * because the traversal in block costs.
-   * called by Trainer after update and before finishBatch
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * startBatch();
-   * if (dense) {
-   *   update(blockVec);
-   * } else {//sparse
-   *   for (row : rows_in_block) {update(rowVec)}
-   * }
-   * auto callback = needSpecialTraversal();
-   * if (callback) {
-   *   // do traverse, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : all_rows_in_block) {callback();}
-   *   }
-   * }
-   * finishBatch();
-   * @endcode
-   *
-   * @return callback if need traverse,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return nullptr;
-  }
-
-  /// called by Trainer after backward() of a batch
-  virtual void finishBatch() {}
-
-  /**
-   * between startBatch() and finishBatch(), update() will be called
-   * by the trainer multiple times, each time for updating one Parameter
-   * with its gradient in PARAMETER_GRADIENT. sparseId is row id,
-   * when sparseId set, update is sparse, each time one row.
-   */
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId = -1LU) const = 0;
-
-  /**
-   * following hooks catch up with current time for sparse update,
-   * In the beginning, call startCatchUpWith() and check return.
-   * In the end, call finishCatchUpWith() to finish state.
-   * callback do the actual works, can call many times for sparse data.
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * auto callback = startCatchUpWith();
-   * if (callback) {
-   *   // do catch up with, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : rows_in_block) {callback();}
-   *   }
-   *   // finish catch up with, main thread
-   *   finishCatchUpWith();
-   * }
-   * @endcode
-   *
-   * @return callback if need catch up with,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback startCatchUpWith() const { return nullptr; }
-  virtual void finishCatchUpWith() {}
-
-  /**
-   * following two hooks used by averager,
-   * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-   *
-   * restore() will restore orginal value if it apply to PARAMETER_VALUE.
-   * Caller must ensure it's catched up with current time before apply.
-   *
-   * Use returned callback same way as callback returned by
-   * ParameterOptimizer::needSpecialTraversal()
-   */
-  virtual TraverseCallback apply() { return nullptr; }
-  virtual TraverseCallback restore() { return nullptr; }
-
-  /// return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  real getLearningRate() const { return learningRate_; }
-
-  virtual void setNoDecay() { applyDecay_ = false; }
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    bool inPserver = false);
-
- protected:
-  typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
-
-  static TraverseCallback composeCallbacks(
-      const TraverseCallbackVec& callbacks) {
-    if (callbacks.size() > 1LU) {
-      return [callbacks](const VectorPtr vecs[],
-                         const ParameterConfig& config,
-                         size_t sparseId) {
-        for (auto callback : callbacks) {
-          callback(vecs, config, sparseId);
-        }
-      };
-    }
-    return (callbacks.size() == 1LU) ? callbacks[0] : nullptr;
-  }
-
-  bool applyDecay_;
-  const OptimizationConfig& optConfig_;
-  std::vector<ParameterType> parameterTypes_;
-
-  /**
-   * global learning rate, init value is opt_config.learning_rate,
-   * sparse regularizer get this value per batch, after StartBatch() called
-   * so, if lr change in StartBatch, please assign to learningRate_
-   */
-  real learningRate_;
-
-  std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
-  int64_t pass_;  // current training pass (starting from 0)
-  bool firstTime_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
deleted file mode 100644
index 72c9841acf6..00000000000
--- a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-#ifdef __AVX__
-#include <x86intrin.h>
-#include <xmmintrin.h>
-#endif
-
-#include "ParameterUpdateFunctions.h"
-
-namespace paddle {
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec) {
-  decayRate *= learningRate;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (size_t i = 0; i < size; ++i) {
-    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
-                     decayRate * value[i];
-    value[i] += momentumVec[i];
-  }
-}
-
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec) {
-  size_t size = value->getSize();
-  real* val = value->getData();
-  real* grd = grad->getData();
-  real* mom = momentumVec->getData();
-  if (typeid(*value) == typeid(CpuVector)) {
-    sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom);
-  } else if (typeid(*value) == typeid(GpuVector)) {
-    value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* _grad,
-                  float* momentumVec) {
-#ifdef __AVX__
-  float* grad = const_cast<float*>(_grad);  // the gradient is not modified
-                                            // but when invoke simd functions
-                                            // need non-const pointer.
-  size_t gradientAlign = 0;
-  size_t gradientAlignHeader = (size_t)grad % sizeof(__m256);
-  CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256))
-      << "Gradent buffer didn't align with momentum buffer";
-  CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256))
-      << "Gradent buffer didn't align with value buffer";
-  if (0 != gradientAlignHeader) {
-    gradientAlignHeader = sizeof(__m256) - gradientAlignHeader;
-    gradientAlign = gradientAlignHeader / sizeof(real);
-
-    // handle the unalign buffer
-    for (size_t i = 0; i < gradientAlign; i++) {
-      momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) -
-                       (decayRate * learningRate * value[i]);
-      value[i] += momentumVec[i];
-    }
-    grad += gradientAlign;
-    momentumVec += gradientAlign;
-    value += gradientAlign;
-  }
-
-  constexpr size_t kParallelNum = 8;
-  constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum;
-  size_t cntLoop = (size - gradientAlign) / nStepSize;
-  size_t cntRem = (size - gradientAlign) % nStepSize;
-  __m256 gradientTmp[kParallelNum];
-  __m256 valueTmp[kParallelNum];
-  __m256 lr, mom, dr;
-  std::function<void(void)> loopFun;
-
-  learningRate *= -1;
-  lr = _mm256_set_ps(learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate);
-
-  if (0 != momentum) {
-    mom = _mm256_set_ps(momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum);
-  }
-
-  decayRate *= learningRate;
-  if (0 != decayRate) {
-    dr = _mm256_set_ps(decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate);
-  }
-
-  auto gradMulFun = [&](void) {
-    gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr);
-    gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr);
-    gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr);
-    gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr);
-    gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr);
-    gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr);
-    gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr);
-    gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr);
-  };
-
-  auto valueMulFun = [&](void) {
-    valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr);
-    valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr);
-    valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr);
-    valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr);
-    valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr);
-    valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr);
-    valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr);
-    valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr);
-  };
-
-  auto momentumMulFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 16) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 24) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 32) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 40) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 48) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 56) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom);
-  };
-
-  auto momentumAddGradFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]);
-  };
-
-  auto momentumZeroFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0];
-    *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1];
-    *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2];
-    *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3];
-    *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4];
-    *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5];
-    *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6];
-    *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7];
-  };
-
-  auto momentumAddValueFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]);
-  };
-
-  auto valueAddMomentumFun = [&](void) {
-    *reinterpret_cast<__m256*>(value) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value),
-                      *reinterpret_cast<__m256*>(momentumVec));
-    *reinterpret_cast<__m256*>(value + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8),
-                      *reinterpret_cast<__m256*>(momentumVec + 8));
-    *reinterpret_cast<__m256*>(value + 16) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16),
-                      *reinterpret_cast<__m256*>(momentumVec + 16));
-    *reinterpret_cast<__m256*>(value + 24) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24),
-                      *reinterpret_cast<__m256*>(momentumVec + 24));
-    *reinterpret_cast<__m256*>(value + 32) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32),
-                      *reinterpret_cast<__m256*>(momentumVec + 32));
-    *reinterpret_cast<__m256*>(value + 40) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40),
-                      *reinterpret_cast<__m256*>(momentumVec + 40));
-    *reinterpret_cast<__m256*>(value + 48) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48),
-                      *reinterpret_cast<__m256*>(momentumVec + 48));
-    *reinterpret_cast<__m256*>(value + 56) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56),
-                      *reinterpret_cast<__m256*>(momentumVec + 56));
-  };
-
-  if (0 == decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumZeroFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 == decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumZeroFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  }
-
-  for (size_t i = 0; i < cntLoop; i++) {
-    loopFun();
-    grad += nStepSize;
-    momentumVec += nStepSize;
-    value += nStepSize;
-  }
-
-  for (size_t i = 0; i < cntRem; i++) {
-    momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) +
-                     (decayRate * value[i]);
-    value[i] += momentumVec[i];
-  }
-#endif
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.h b/paddle/legacy/parameter/ParameterUpdateFunctions.h
deleted file mode 100644
index a7cc1c4c47b..00000000000
--- a/paddle/legacy/parameter/ParameterUpdateFunctions.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/**
- * Performs the following operations.
- *
- * momentumVec = momentum * momentumVec
- *               - learningRate * grad
- *               - learningRate * decayRate * value
- *
- * value = value + momentumVec
- * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary
- * computation.
- */
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec);
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec);
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* grad,
-                  float* momentumVec);
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.cpp b/paddle/legacy/parameter/ParameterUpdaterBase.cpp
deleted file mode 100644
index 7d9d3fad631..00000000000
--- a/paddle/legacy/parameter/ParameterUpdaterBase.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterBase.h"
-#include <fstream>
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-void ParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-  for (ParameterType type : getParameterTypes()) {
-    for (auto& para : parameters) {
-      para->enableType(type);
-    }
-  }
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    nonStaticParaIDMap_.insert(
-        std::pair<size_t, size_t>(parameters_[pid]->getID(), pid));
-  }
-
-  for (auto& para : parameters) {
-    if (!para->isStatic()) {
-      para->initHook();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.h b/paddle/legacy/parameter/ParameterUpdaterBase.h
deleted file mode 100644
index 493512886ca..00000000000
--- a/paddle/legacy/parameter/ParameterUpdaterBase.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Parameter.h"
-
-namespace paddle {
-
-class ParameterOptimizer;
-
-class ParameterUpdater {
- public:
-  ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
-  virtual ~ParameterUpdater() {}
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  // called by Trainer when starting a new pass
-  virtual void startPass() {}
-
-  // called by Trainer then finishing a pass, ruturn true if pass accepted
-  virtual bool finishPass() { return true; }
-
-  // called by Trainer before backward() of a batch
-  // Return the type of pass it needs. This pass type will be passed
-  // to GradientMachine::forward() by the caller.
-  virtual PassType startBatch(int64_t batchSize) {
-    (void)batchSize;
-    return PASS_TRAIN;
-  }
-
-  // called by Trainer after backward() of a batch
-  // cost: the cost for this batch
-  virtual void finishBatch(real cost) { (void)cost; }
-
-  // between startBatch() and finishBatch(), update() will be called
-  // by the trainer multiple times, each time for updating one Parameter
-  // with its gradient in PARAMETER_GRADIENT
-  void update(Parameter* para) {
-    SetDevice setDevice(para->getDeviceId());
-    para->updateHook();
-    this->updateImpl(para);
-  }
-
-  // only get required sparse rows by default,
-  // get full matrix parameter if *fullSize* set
-  // get PARAMETER_APPLY on pserver if *apply* set
-  virtual void getParametersRemote(bool fullSize = false, bool apply = false) {}
-
-  virtual void loadParametersRemote(const std::string& dirName) {}
-  virtual void saveParametersRemote(const std::string& dirName) {}
-  virtual void randParametersRemote() {}
-
-  // something like regularization may be delayed apply
-  // trainer should catch up with before parameter is saved or sended.
-  virtual void catchUpWith() {}
-
-  // following two hooks used by averager
-  // apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-  // restore() will restore orginal value if it apply to PARAMETER_VALUE.
-  virtual void apply() {}
-  virtual void restore() {}
-
-  // return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {}
-#endif
-
- protected:
-  virtual void updateImpl(Parameter* para) = 0;
-
-  std::vector<ParameterType> parameterTypes_;
-  std::vector<ParameterPtr> parameters_;
-  std::map<size_t, size_t> nonStaticParaIDMap_;
-};
-
-// Composite of ParameterUpdaters, each ParameterUpdater handle
-// part of all Parameters. It's useful when we need different
-// update strategy for different Parameter.
-class ParameterUpdaterComposite : public ParameterUpdater {
- public:
-  ParameterUpdaterComposite() {}
-  virtual ~ParameterUpdaterComposite() {}
-
-  virtual void init(const std::vector<ParameterPtr>& parameters) = 0;
-
-  virtual void startPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->startPass(); });
-  }
-
-  virtual bool finishPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(); });
-    return true;
-  }
-
-  virtual PassType startBatch(int64_t batchSize) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->startBatch(batchSize);
-    });
-    return PASS_TRAIN;
-  }
-
-  virtual void finishBatch(real cost) {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishBatch(cost); });
-  }
-
-  virtual void getParametersRemote(bool fullSize, bool apply) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->getParametersRemote(fullSize, apply);
-    });
-  }
-  virtual void loadParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->loadParametersRemote(dirName);
-    });
-  }
-  virtual void saveParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->saveParametersRemote(dirName);
-    });
-  }
-  virtual void randParametersRemote() {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->randParametersRemote();
-    });
-  }
-
-  virtual void catchUpWith() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->catchUpWith(); });
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    for (auto& updater : updaters_) {
-      updater->setForwardbackwardTime(delta);
-    }
-  }
-#endif
-
-  virtual void apply() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->apply(); });
-  }
-  virtual void restore() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
-  }
-
- protected:
-  virtual void updateImpl(Parameter* para) {}
-  std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.cpp b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
deleted file mode 100644
index bfb9769fb67..00000000000
--- a/paddle/legacy/parameter/ParameterUpdaterHook.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterHook.h"
-
-#include <algorithm>
-#include <atomic>
-#include <fstream>
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * The static pruning hook
- * Static means user specify a sparsity_ratio before training started, and the
- * network will prune the parameters based on the sparsity_ratio. More details
- * can be found https://arxiv.org/pdf/1506.02626.pdf.
- */
-
-class StaticPruningHook : public IParameterUpdaterHook {
- public:
-  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
-      : initCount_(0) {
-    sparsityRatio_ = hookConfig.sparsity_ratio();
-  }
-
-  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
-                             const std::pair<real, size_t> &pair2) {
-    return pair1.first > pair2.first;
-  }
-
-  void update(Parameter *para) {
-    updateThreadChecker_.check();
-    auto &vec = para->getBuf(PARAMETER_GRADIENT);
-    if (vec) {
-      vec->dotMul(*maskVec_);
-    }
-  }
-
-  void generateMask(Parameter *para) {
-    VectorPtr maskTemp = Vector::create(para->getSize(), false);
-    maskTemp->zeroMem();
-    real *maskTempData = maskTemp->getData();
-    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
-
-    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
-    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
-
-    paraCpuCopy->copyFrom(*paraVec);
-    std::vector<std::pair<real, size_t>> param;
-
-    for (size_t i = 0; i < para->getSize(); i++)
-      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
-
-    std::partial_sort(
-        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
-    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
-
-    // Currently just use a mask vector for hack.
-    if (para->useGpu()) {
-      maskVec_ = Vector::create(para->getSize(), para->useGpu());
-      maskVec_->copyFrom(*maskTemp);
-    } else {
-      maskVec_ = maskTemp;
-    }
-  }
-
-  void init(Parameter *para) {
-    generateMask(para);
-    size_t initCount = this->initCount_.fetch_add(1);
-    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
-                                "in same ParamterUpdater";
-    VLOG(3) << "Initialize Parameter " << para;
-    SetDevice device(para->getDeviceId());
-
-    auto &paraVec = para->getBuf(PARAMETER_VALUE);
-    paraVec->dotMul(*maskVec_);
-  }
-
- private:
-  SameThreadChecker updateThreadChecker_;
-  std::atomic<size_t> initCount_;
-  VectorPtr maskVec_;
-  real sparsityRatio_;
-};
-
-IParameterUpdaterHook::IParameterUpdaterHook() {}
-
-IParameterUpdaterHook::~IParameterUpdaterHook() {}
-
-/**
- * A Hasher used by g_hooks.
- *
- * Use the independent hasher intendedly. There is a hasher in PServer for hash
- * ParameterBlock. But not to use same hasher to reduce dependency.
- *
- * May be extracted to Util.h to unify the hasher.
- */
-class StringIntPairHasher {
- public:
-  size_t operator()(const std::pair<std::string, int> &k) const {
-    return intHasher_(strHasher_(k.first) + k.second);
-  }
-
- private:
-  std::hash<std::string> strHasher_;
-  std::hash<int> intHasher_;
-};
-
-static WeakKVCache<std::pair<std::string, int>,
-                   IParameterUpdaterHook,
-                   StringIntPairHasher>
-    g_hookCache_;
-
-/**
- * ParameterUpdaterHook actually factory method.
- */
-static IParameterUpdaterHook *createImpl(
-    const ParameterUpdaterHookConfig &config) {
-  auto &type = config.type();
-  if (type == "pruning") {
-    return new StaticPruningHook(config);
-  }
-
-  LOG(FATAL) << "Unknown Hook type:  " << type;
-  return nullptr;
-}
-
-std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
-    const ParameterConfig &paramConfig, int idx) {
-  std::pair<std::string, int> key = {paramConfig.name(), idx};
-  return g_hookCache_.get(
-      key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.h b/paddle/legacy/parameter/ParameterUpdaterHook.h
deleted file mode 100644
index cb96e4cf007..00000000000
--- a/paddle/legacy/parameter/ParameterUpdaterHook.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class Parameter;
-
-/**
- * The parameter updater hook interface.
- *
- * The Parameter Updater hooks is a group of methods invoke before
- * ParameterUpdater::updateImpl. It can modify gradient/momentum/etc before
- * parameter optimization.
- */
-class IParameterUpdaterHook {
- public:
-  virtual ~IParameterUpdaterHook();
-
-  /**
-   * Create A ParameterUpdaterHook.
-   *
-   * The same parameter shared the same hooks. So it returns shared_ptr.
-   *
-   * @param param_config The parameter config.
-   * @param idx  The element index of param_config.updater_hooks() array.
-   */
-  static std::shared_ptr<IParameterUpdaterHook> create(
-      const ParameterConfig& paramConfig, int idx);
-
-  /**
-   * The update hook method. Invoke before ParameterUpdater::updateImpl
-   */
-  virtual void update(Parameter* para) = 0;
-
-  /**
-   * The init hook method. Invoke in ParameterUpdater::init
-   */
-  virtual void init(Parameter* para) = 0;
-
- protected:
-  /**
-   * Ctor.
-   */
-  IParameterUpdaterHook();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.cpp b/paddle/legacy/parameter/Regularizer.cpp
deleted file mode 100644
index c1d5f4fa684..00000000000
--- a/paddle/legacy/parameter/Regularizer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Regularizer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-Regularizer* Regularizer::get(const std::vector<ParameterType>& types,
-                              const ParameterConfig& paraConfig) {
-  bool useLearningRateVec =
-      std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) !=
-      types.end();
-  if (paraConfig.decay_rate_l1() > 0.0f &&
-      paraConfig.decay_rate() > 0.0f) {  // use L1 and L2
-    if (useLearningRateVec) {
-      static L1L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate_l1() > 0.0f) {  // use L1 only
-    if (useLearningRateVec) {
-      static L1LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate() > 0.0f) {  // use L2 only
-    if (useLearningRateVec) {
-      static L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.h b/paddle/legacy/parameter/Regularizer.h
deleted file mode 100644
index fa5384e2325..00000000000
--- a/paddle/legacy/parameter/Regularizer.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterUpdaterBase.h"
-
-namespace paddle {
-
-// Regularizer function for parameter, e.g. L1/L2
-class Regularizer {
- public:
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,  // learningrate from optimizer
-                      int t0,             // last occurence time
-                      int t) const = 0;   // current time
-  virtual ~Regularizer() {}
-
-  static Regularizer* get(const std::vector<ParameterType>& types,
-                          const ParameterConfig& paraConfig);
-};
-
-// L1 Regularizer, |w|_1
-class L1Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L1 Lr Regularizer
-class L1LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L2 Regularizer, |w|_2^2
-class L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L2 Lr Regularizer
-class L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Regularizer, |w|_1 + |w|_2^2
-class L1L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Lr Regularizer
-class L1L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.cpp b/paddle/legacy/parameter/ThreadLocalBuffer.cpp
deleted file mode 100644
index 550e41dfdaa..00000000000
--- a/paddle/legacy/parameter/ThreadLocalBuffer.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadLocalBuffer.h"
-#include "Parameter.h"
-
-namespace paddle {
-namespace parameter {
-
-static ThreadLocal<std::vector<VectorPtr>> tlsTempBufs_;
-
-VectorPtr* getThreadLocalBuffer() {
-  std::vector<VectorPtr>& bufs = *tlsTempBufs_;
-  if (bufs.empty()) {
-    bufs.resize(NUM_PARAMETER_TYPES);
-    for (auto& vec : bufs) {
-      vec.reset(new CpuVector(0, nullptr));
-    }
-  }
-  return bufs.data();
-}
-
-}  // namespace parameter
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.h b/paddle/legacy/parameter/ThreadLocalBuffer.h
deleted file mode 100644
index d360feeed6c..00000000000
--- a/paddle/legacy/parameter/ThreadLocalBuffer.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-namespace parameter {
-extern VectorPtr* getThreadLocalBuffer();
-}  // namespace parameter
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.cpp b/paddle/legacy/parameter/Weight.cpp
deleted file mode 100644
index 9d94050a5cd..00000000000
--- a/paddle/legacy/parameter/Weight.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Weight.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  if (param->isSparse()) {
-    CHECK_LE(param->getSize(), width * height);
-  } else {
-    CHECK_EQ(param->getSize(), width * height);
-  }
-
-  // weight_
-  weight_ = param->getMat(PARAMETER_VALUE);
-  if (!weight_ && vPtr) {
-    weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width);
-  }
-  if (weight_) {
-    CHECK_EQ(height, weight_->getHeight());
-    CHECK_EQ(width, weight_->getWidth());
-  }
-
-  // weightGrad
-  weightGrad_ = param->getMat(PARAMETER_GRADIENT);
-  if (!weightGrad_ && gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width);
-  }
-  if (weightGrad_) {
-    CHECK_EQ(height, weightGrad_->getHeight());
-    CHECK_EQ(width, weightGrad_->getWidth());
-  }
-
-  parameter_ = param;
-}
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  CHECK_LE(offset + width * height, param->getSize());
-
-  // weight_
-  if (vPtr) {
-    weight_ = Matrix::create(vPtr->getData() + offset,
-                             height,
-                             width,
-                             /* trans */ false,
-                             param->useGpu());
-  }
-
-  // weightGrad
-  if (gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getData() + offset,
-                                 height,
-                                 width,
-                                 /* trans */ false,
-                                 param->useGpu());
-  }
-
-  parameter_ = param;
-}
-
-const ParameterPtr& Weight::getParameterPtr() { return parameter_; }
-void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; }
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.h b/paddle/legacy/parameter/Weight.h
deleted file mode 100644
index 241c8d829cd..00000000000
--- a/paddle/legacy/parameter/Weight.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <vector>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-class Weight {
- private:
-  MatrixPtr weight_;
-  MatrixPtr weightGrad_;
-  ParameterPtr parameter_;
-
- public:
-  Weight(size_t height, size_t width, ParameterPtr parameter);
-  Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
-
-  const MatrixPtr& getW() { return weight_; }
-  const MatrixPtr& getWGrad() { return weightGrad_; }
-  const ParameterPtr& getParameterPtr();
-
-  void incUpdate(const UpdateCallback& callback) {
-    getParameterPtr()->incUpdate(callback);
-  }
-
-  void setParameterPtr(ParameterPtr param);
-};
-
-typedef std::vector<std::unique_ptr<Weight>> WeightList;
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/tests/CMakeLists.txt b/paddle/legacy/parameter/tests/CMakeLists.txt
deleted file mode 100644
index 181ccdc1f09..00000000000
--- a/paddle/legacy/parameter/tests/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_simple_unittest(test_common)
-add_simple_unittest(test_argument)
diff --git a/paddle/legacy/parameter/tests/test_argument.cpp b/paddle/legacy/parameter/tests/test_argument.cpp
deleted file mode 100644
index 0c632e0cd10..00000000000
--- a/paddle/legacy/parameter/tests/test_argument.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/parameter/Argument.h>
-
-using namespace paddle;  // NOLINT
-
-TEST(Argument, poolSequenceWithStride) {
-  Argument input, output;
-  ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
-  int* inStart = input.sequenceStartPositions->getMutableData(false);
-  inStart[0] = 0;
-  inStart[1] = 9;
-  inStart[2] = 14;
-  inStart[3] = 17;
-  inStart[4] = 30;
-
-  int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
-  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
-
-  for (auto reversed : {false, true}) {
-    ICpuGpuVectorPtr stridePositions;
-    output.poolSequenceWithStride(
-        input, 5 /* stride */, &stridePositions, reversed);
-
-    const int* outStart = output.sequenceStartPositions->getData(false);
-    CHECK_EQ(outStart[0], 0);
-    CHECK_EQ(outStart[1], 2);
-    CHECK_EQ(outStart[2], 3);
-    CHECK_EQ(outStart[3], 4);
-    CHECK_EQ(outStart[4], 7);
-
-    CHECK_EQ(stridePositions->getSize(), 8UL);
-    auto result = reversed ? strideResultReversed : strideResult;
-    for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/parameter/tests/test_common.cpp b/paddle/legacy/parameter/tests/test_common.cpp
deleted file mode 100644
index 8de9d6da983..00000000000
--- a/paddle/legacy/parameter/tests/test_common.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/Util.h>
-#include <stdlib.h>
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
-#include <paddle/legacy/utils/Flags.h>
-#include <paddle/legacy/utils/Stat.h>
-#include <paddle/legacy/utils/Thread.h>
-
-using namespace paddle;  // NOLINT
-
-class CommonTest : public ::testing::Test {
- protected:
-  CommonTest() : testStat_("test") {}
-  virtual ~CommonTest() {}
-  virtual void SetUp() {
-    const size_t buffSize[] = {
-        100, 128, 500, 1024, 4096, 10240, 102400, 1000000};
-    sizeVec_.resize(8);
-    memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t));
-    valueUint_.resize(4);
-    valueUint_[0].first = 0.0;
-    valueUint_[0].second = 0.0;
-    valueUint_[1].first = 0.0;
-    valueUint_[1].second = 1.0;
-    valueUint_[2].first = 1.0;
-    valueUint_[2].second = 0.0;
-    valueUint_[3].first = 1.0;
-    valueUint_[3].second = 1.0;
-    learningRate_ = 1.0;
-  }
-
-  void test_sgdUpadate(real* gradientBuffer,
-                       real* valueBuffer,
-                       real* momentumBuffer,
-                       size_t size);
-
-  virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
-
- protected:
-  std::vector<std::pair<real, real>> valueUint_;
-  std::vector<size_t> sizeVec_;
-  real learningRate_;
-  StatSet testStat_;
-};
-
-void CommonTest::test_sgdUpadate(real* gradientBuffer,
-                                 real* valueBuffer,
-                                 real* momentumBuffer,
-                                 size_t size) {
-// sgdUpdateAvx has no double version yet
-#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE)
-  real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0;
-  real* gradTmp = new real[size];
-  real* valueTmp = new real[size];
-  real* momentumTmp = new real[size];
-  memcpy(gradTmp, gradientBuffer, size * sizeof(real));
-  memcpy(valueTmp, valueBuffer, size * sizeof(real));
-  memcpy(momentumTmp, momentumBuffer, size * sizeof(real));
-  for (auto& arg : valueUint_) {
-    {
-      {
-        struct timeval t;
-        REGISTER_TIMER("gettimeofday", 0, testStat_);
-        gettimeofday(&t, NULL);
-      }
-      REGISTER_TIMER("avxTimer", 0);
-      sgdUpdateAvx(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueBuffer,
-                   gradientBuffer,
-                   momentumBuffer);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum1 += valueBuffer[i];
-      momSum1 += momentumBuffer[i];
-      // std::cout << "["
-      //          << valueBuffer[i]
-      //          << "," << momentumBuffer[i]
-      //          << "," << gradientBuffer[i] << "],";
-    }
-    {
-      REGISTER_TIMER("cpuTimer", 0);
-      sgdUpdateCpu(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueTmp,
-                   gradTmp,
-                   momentumTmp);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum2 += valueTmp[i];
-      momSum2 += momentumTmp[i];
-      // std::cout << "["
-      //          << valueTmp[i]
-      //          << "," << momentumTmp[i]
-      //          << "," << gradTmp[i] << "],";
-    }
-
-    VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2;
-    VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2;
-    ASSERT_EQ(valueSum1, valueSum2);
-    ASSERT_EQ(momSum1, momSum2);
-  }
-  delete[] gradTmp;
-  delete[] valueTmp;
-  delete[] momentumTmp;
-#endif
-}
-
-TEST_F(CommonTest, sgdUpdate) {
-  const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
-  for (auto& size : sizeVec_) {
-    real *gradientBuffer, *valueBuffer, *momentumBuffer;
-    CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
-             0);
-    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
-    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
-             0);
-
-    for (size_t i = 0; i < size; i++) {
-      gradientBuffer[i] = 1.0;
-      valueBuffer[i] = 2.0;
-      momentumBuffer[i] = 3.0;
-    }
-    for (int i = 0; i < 6; i++) {
-      LOG(INFO) << "----------------------" << size << ":" << alignHeader[i]
-                << "-------------------------";
-      test_sgdUpadate(&gradientBuffer[alignHeader[i]],
-                      &valueBuffer[alignHeader[i]],
-                      &momentumBuffer[alignHeader[i]],
-                      size - alignHeader[i]);
-    }
-    free(gradientBuffer);
-    free(valueBuffer);
-    free(momentumBuffer);
-  }
-  globalStat.printAllStatus();
-  testStat_.printAllStatus();
-}
-
-TEST_F(CommonTest, syncThreadPool) {
-  SyncThreadPool pool(10);
-
-  std::vector<int> nums;
-  nums.resize(10);
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)i, nums[i]);
-  }
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)0, nums[i]);
-  }
-}
diff --git a/paddle/legacy/pserver/BaseClient.cpp b/paddle/legacy/pserver/BaseClient.cpp
deleted file mode 100644
index 13bb8a1cc58..00000000000
--- a/paddle/legacy/pserver/BaseClient.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseClient.h"
-#include <gflags/gflags.h>
-#include <string.h>
-#include <vector>
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_string(pservers);
-
-namespace paddle {
-
-BaseClient::BaseClient(bool separate, int numPorts)
-    : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) {
-  CHECK_GT(numPorts, 0);
-}
-
-BaseClient::~BaseClient() {}
-
-void BaseClient::recvData() { recvSyncBarrier_->wait(); }
-
-void BaseClient::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void BaseClient::startThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1));
-
-  sendThreads_.resize(threadNum_);
-  recvThreads_.resize(threadNum_);
-  sendJobQueue_.resize(threadNum_);
-  recvJobQueue_.resize(threadNum_);
-
-  for (int i = 0; i < threadNum_; ++i) {
-    sendJobQueue_[i].reset(new SendQueue());
-    recvJobQueue_[i].reset(new SendQueue());
-
-    sendThreads_[i].reset(
-        new std::thread([this](int id) { this->send(id); }, i));
-
-    recvThreads_[i].reset(
-        new std::thread([this](int id) { this->recv(id); }, i));
-  }
-}
-
-void BaseClient::finishThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  stopping_ = true;
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(nullptr);
-  }
-  for (auto& thread : sendThreads_) {
-    thread->join();
-  }
-  for (auto& thread : recvThreads_) {
-    thread->join();
-  }
-  stopping_ = false;
-}
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/BaseClient.h b/paddle/legacy/pserver/BaseClient.h
deleted file mode 100644
index 66e8f39cd60..00000000000
--- a/paddle/legacy/pserver/BaseClient.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterService.pb.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/pserver/ProtoServer.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Queue.h"
-
-namespace paddle {
-
-/**
- * it manages all connections to pservers.
- * it exists two modes to manage connections to all pservers. Firstly, one
- * connection owns two threads that separately manage to send and receive
- * data. Secondly, each thread uses one connection for all activation in it.
- * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
- * recvJobQueue_. the second solution use some shared thread pool to manage
- * connections.
- */
-class BaseClient {
- protected:
-  typedef std::unique_ptr<std::thread> ThreadPtr;
-  typedef std::vector<std::vector<iovec>> InputIovs;
-  typedef std::vector<SendParameterRequest> SendRequest;
-  typedef std::vector<SendDataRequest> SendDataRequestVec;
-
-  // TODO(yanfei):
-  // refine data structure to unify parameter and features communication
-  struct SendJob {
-    /// store parameters related blocks data
-    InputIovs parallelInputIovs;
-    /// store protobuf request
-    SendRequest parallelRequests;
-    /// store data, such as features for metric learning
-    SendDataRequestVec parallelDataRequests;
-  };
-
- public:
-  explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
-
-  virtual ~BaseClient();
-
-  typedef std::shared_ptr<SendJob> SendJobPtr;
-  typedef Queue<SendJobPtr> SendQueue;
-
-  /// send data to server, support only synchronize
-  template <class DataType>
-  void putData(int clientId,
-               SendDataType type,
-               DataType* datas,
-               size_t size,
-               DataUpdateMode mode) {
-    synchronize(SYNC_DATA);
-    sendData(clientId, type, mode, datas, size);
-    recvData();
-    synchronize(SYNC_DATA);
-  }
-
-  template <class DataType>
-  void putOwnData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN);
-  }
-
-  template <class DataType>
-  void getAllData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    sendData(clientId,
-             type,
-             DATA_UPDATE_MODE_GET_ALL,
-             reinterpret_cast<DataType*>(NULL),
-             0);
-    recvData();
-    size_t dataOffset = 0;
-    for (auto& recvMem : recvDataMems_) {
-      CHECK_LE(dataOffset, size);
-      size_t memSize = std::min(recvMem.get()->getSize(),
-                                sizeof(DataType) * (size - dataOffset));
-      CHECK_EQ(memSize % sizeof(DataType), size_t(0));
-      memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize);
-      dataOffset += memSize / sizeof(DataType);
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * Reduces values on all clients.
-   * This reduce just support SUM.
-   * The results are saved in recvBuf of rootId client
-   */
-  template <class DataType>
-  void reduce(DataType* sendBuf,
-              DataType* recvBuf,
-              size_t size,
-              int clientId,
-              int rootId) {
-    putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size);
-    if (rootId == clientId) {
-      getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size);
-    }
-  }
-
-  /**
-   * return trans data type according to the input type
-   */
-  virtual TransDataType getTransDtype(const std::type_info& info) {
-    TransDataType dataType;
-    if (typeid(int*) == info) {  // NOLINT
-      dataType = TRANS_INT32;
-    } else if (typeid(uint32_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT32_T;
-    } else if (typeid(int64_t*) == info) {  // NOLINT
-      dataType = TRANS_INT64_T;
-    } else if (typeid(uint64_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT64_T;
-    } else if (typeid(float*) == info) {  // NOLINT
-      dataType = TRANS_FLOAT;
-    } else if (typeid(double*) == info) {  // NOLINT
-      dataType = TRANS_DOUBLE;
-    } else {
-      LOG(FATAL) << "not supported";
-    }
-    return dataType;
-  }
-
- protected:
-  /// for a > 0, b > 0:
-  /// return the smallest x s.t. b*x >= a
-  static int divup(int a, int b) { return (a + b - 1) / b; }
-
-  int calcClientId(int i, int serviceNum) {
-    return (i + FLAGS_trainer_id * numPorts_) % serviceNum;
-  }
-
-  /// start threads in sendThreads_ and recvThreads_
-  void startThreads();
-
-  /// finish threads in sendThreads_ and recvThreads_
-  void finishThreads();
-
-  template <class DataType>
-  void prepareData(int clientId,
-                   SendDataType type,
-                   DataUpdateMode updateMode,
-                   DataType* datas,
-                   size_t size,
-                   SendJob* sendJob) {
-    sendJob->parallelDataRequests.resize(serviceNum_);
-    sendJob->parallelInputIovs.resize(serviceNum_);
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      request.set_update_mode(updateMode);
-      request.set_type(type);
-      request.set_client_id(clientId);
-      request.set_server_id(i);
-    }
-
-    /// split datas which need send to Server into serviceNum_ pieces
-    if (!datas) {
-      CHECK(!size) << "ownSize should be zero since datas is nullptr";
-    }
-    size_t baseSize = size / serviceNum_;
-    size_t dataOffset = 0;
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      DataBlock* block = request.add_blocks();
-      size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize;
-      size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0;
-      block->set_total_size(realSize * sizeof(DataType));
-      block->set_data_size(sizeof(DataType));
-      // TODO(yuyang18): The getTransDtype can be rewritten as template method
-      //                 to reduce runtime overhead.
-      block->set_data_type(getTransDtype(typeid(DataType*)));  // NOLINT
-      if (datas) {
-        sendJob->parallelInputIovs[i].push_back(
-            {datas + dataOffset, realSize * sizeof(DataType)});
-      }
-      dataOffset += ownSize;
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * @brief send data to all data servers
-   *
-   * @note  each trainer sends all its data to all data servers
-   *        it's for broadcast data synchronization, such as features
-   *        synchronization in metric learning.
-   */
-  template <class DataType>
-  void sendData(int clientId,
-                SendDataType type,
-                DataUpdateMode updateMode,
-                DataType* datas,
-                size_t size) {
-    SendJobPtr sendJob = std::make_shared<SendJob>();
-    prepareData(clientId, type, updateMode, datas, size, sendJob.get());
-    for (int i = 0; i < threadNum_; ++i) {
-      sendJobQueue_[i]->enqueue(sendJob);
-    }
-  }
-
-  /**
-   * @brief recv data from all data servers
-   *
-   * @note  synchronize all recv threads
-   */
-  void recvData();
-
-  /// send request, and recv responses
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
-  /**
-   * @brief synchronize all trainers and pservers
-   *
-   * @note  used to ensure that data of all trainers have been received
-   */
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /**
-   * @brief use multithread to separately send data
-   *
-   * @note  each thread should read its own JobQueue to handle requests
-   *        each thread should calcClientId() to retrieve connections
-   *        managed by himself.
-   *        send and recv are implemented in child class.
-   */
-  virtual void send(int threadId) = 0;
-
-  /**
-   * @brief use multithread to separately receive data
-   *
-   * @note  almost same as send()
-   */
-  virtual void recv(int threadId) = 0;
-
- protected:
-  bool stopping_;
-  /// nodes * ports that means the number of real pservers
-  int serviceNum_;
-  /**
-   * threads num for managing all services. Normally the
-   * number of pservers are relatively less than several
-   * hundreds so that using thread-based parallelization
-   * can benifit traffic performance and pserver's sgd
-   * optimization performance.
-   */
-  int threadNum_;
-  /// the connection manager at client end
-  std::vector<ProtoClient> clients_;
-  /// send threads for parallelization
-  std::vector<ThreadPtr> sendThreads_;
-  /// recv threads for parallelization
-  std::vector<ThreadPtr> recvThreads_;
-  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
-
-  // TODO(yanfei):
-  // current pserver's will return value until all parameters'
-  // optimization are finished so that recv are not overlapped
-  // in reality. More robust implimentation should be to pipeline
-  // all send/recv action based on parameter unit level, and
-  // it will benifits deep and larger model training in future,
-  // especially local node compution power surpasses inter-connection
-  // such as GPU cluster, even with BOX GPU cluster.
-  // queue for buffering send request
-  /**
-   * send/recv queue cooperates with each other to accomplish
-   * overlapping communication with forwardBackward action.
-   */
-  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
-  /// queue for buffering recv request
-  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
-  /// specific for dserver
-  SendJob sendJob_;
-  /// port num for each node
-  int numPorts_;
-  /// if set, overlapped optimization is disabled
-  bool separateSendAndRecv_;
-  std::vector<CpuMemHandlePtr> recvDataMems_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/CMakeLists.txt b/paddle/legacy/pserver/CMakeLists.txt
deleted file mode 100644
index 0ae9c6ef6af..00000000000
--- a/paddle/legacy/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-# parameter server package
-
-######################### paddle_network ####################
-set(NETWORK_SOURCES
-    LightNetwork.cpp
-    SocketChannel.cpp
-    ProtoServer.cpp)
-
-set(NETWORK_HEADERS
-    LightNetwork.h
-    SocketChannel.h
-    ProtoServer.h)
-
-add_library(paddle_network STATIC
-    ${NETWORK_SOURCES})
-
-add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
-
-################### paddle_pserver ######################
-set(PSERVER_SOURCES
-    BaseClient.cpp
-    ParameterClient2.cpp
-    ParameterServer2.cpp
-    SparseParameterDistribution.cpp
-    ParameterServerController.cpp)
-
-set(PSERVER_HEADERS
-    BaseClient.h
-    ParameterClient2.h
-    ParameterServer2.h
-    SparseParameterDistribution.h
-    ParameterServerController.h)
-
-add_library(paddle_pserver STATIC
-    ${PSERVER_SOURCES})
-
-add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
-
-set(PSERVER_MAIN_SOURCES
-    ParameterServer2Main.cpp)
-
-if(WITH_TESTING)
-  add_subdirectory(test)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
-  link_paddle_exe(paddle_pserver_main)
-
-  install(TARGETS paddle_pserver_main
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
diff --git a/paddle/legacy/pserver/LightNetwork.cpp b/paddle/legacy/pserver/LightNetwork.cpp
deleted file mode 100644
index 469c95853ec..00000000000
--- a/paddle/legacy/pserver/LightNetwork.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <chrono>
-
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <sys/ioctl.h>
-#include <sstream>
-
-#include "LightNetwork.h"
-#include "RDMANetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-/// quick ack can reduce the latency of small message
-DEFINE_bool(small_messages,
-            false,
-            "if message size is small, recommend set it True to enable quick "
-            "ack and no delay");
-
-/// reasonable sock_send_buf_size can control the traffic injected into switch
-/// network. Injecting too many data into traffic could cause packets loss which
-/// cause long latency and degrade the efficiency of communication.
-DEFINE_int32(sock_send_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock send buff size, can reduce network congestion if "
-             "set carefully");
-
-/// reasonable size can hold bursted packets and reduce packets loss
-DEFINE_int32(sock_recv_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock recv buff size");
-
-/// reasonable sock_listen_queue_size can control maximum pending connections.
-DEFINE_int32(sock_listen_queue_size,
-             1024,
-             "listen queue size when pserver listen a TCP port");
-
-namespace paddle {
-
-/**
- * @brief get ip address from interface name
- *
- * @param[in] device device interface name
- */
-std::string getIpAddr(std::string &device) {
-  int sock;
-  struct sockaddr_in sin;
-  struct ifreq ifr;
-
-  sock = socket(AF_INET, SOCK_DGRAM, 0);
-  CHECK(sock >= 0) << "Create socket error.";
-
-  strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ);
-  ifr.ifr_name[IFNAMSIZ - 1] = 0;
-
-  CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0);
-  memcpy(&sin, &ifr.ifr_addr, sizeof(sin));
-  close(sock);
-  return std::string(inet_ntoa(sin.sin_addr));
-}
-
-/**
- * @brief set sock option
- *
- * @param[in] sockfd sock file descriptor
- *
- * @note adjust some default sock option for better performance
- */
-void setOption(int sockfd) {
-#if !defined(__APPLE__) && !defined(__OSX__)
-  int sendSize = FLAGS_sock_send_buf_size;
-  int recvSize = FLAGS_sock_recv_buf_size;
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)),
-      0);
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
-      0);
-#endif
-
-  if (FLAGS_small_messages) {
-    int optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
-        0);
-#ifdef TCP_QUICKACK
-    optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
-        0);
-#endif
-  }
-  int reuse = 1;
-  CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
-           0);
-}
-
-/**
- * @brief class constructor for SocketServer
- * @param[in] addr sock bind address
- * @param[in] port sock bind port
- * @param[in] rdmaCpu rdma sock bind cpu core
- *
- * @note start one socket server which hosts parameter server process.
- *       rdmaCpu is passed to rdma deamon for better performance, and
- *       start tcp socket instead of rdma socket if rdmaCpu is equal
- *       to -1. Each trainer process starts one connection to one socket
- *       server, and use --ports_num to build more connections to harness
- *       fat communication channel if necessary.
- *       each connection is controlled by single thread with blocking
- *       read and write.
- */
-SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
-    : port_(port), addr_(addr), stopping_(false) {
-  if (rdmaCpu == -1) {
-    tcpRdma_ = F_TCP;
-    socket_ = 0;
-    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
-  } else {
-    tcpRdma_ = F_RDMA;
-    rdmaCpu_ = rdmaCpu;
-    rdmaSocket_ = 0;
-
-    std::stringstream ss;
-    ss << port;
-    rdmaUri_ = "rdma://" + addr + ":" + ss.str();
-  }
-
-  /// trigger to initialize RDMA lib
-  CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
-}
-
-SocketServer::~SocketServer() {
-  stopping_ = true;
-  /// trigger accept thread to stop
-  {
-    SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_);
-  }
-  this->join();
-}
-
-/**
- * @brief start one tcp server which hosts parameter server
- *
- * @note do tcp socket bind and listen. it will spawn one thread
- *       for each connection
- */
-void SocketServer::tcpServer() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-  struct hostent *server;
-
-  /// First call to socket() function
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /// Initialize socket structure
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_port = htons(port_);
-  if (!addr_.empty()) {
-    server = gethostbyname(addr_.c_str());
-    CHECK(server) << "ERROR, no such host: " << addr_;
-    bcopy((char *)server->h_addr,
-          (char *)&serv_addr.sin_addr.s_addr,
-          server->h_length);
-  } else {
-    serv_addr.sin_addr.s_addr = INADDR_ANY;
-  }
-
-  setOption(socket_);
-
-  /// Now bind the host address using bind() call.
-  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding " << addr_;
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-    CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsockfd, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  close(socket_);
-  LOG(INFO) << "pserver accept thread finish, addr=" << addr_
-            << " port=" << port_;
-}
-
-/**
- * @brief start one rdma server which hosts parameter server
- *
- * @note do rdma bind and listen, which calling self-defined socket
- *       like rdma library. it will spawn one thread for each connection
- */
-void SocketServer::rdmaServer() {
-  struct sxi_sock *newsock;
-
-  /// First call to socket() function
-  rdmaSocket_ = rdma::ssocket(rdmaCpu_);
-  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
-
-  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
-      << "ERROR bind RDMA socket";
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsock = rdma::accept(rdmaSocket_);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsock) << "ERROR on accept";
-
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-
-    struct sockaddr_in *saddr = rdma::getSourceAddress(newsock);
-    CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsock, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  rdma::close(rdmaSocket_);
-  LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_;
-}
-
-/**
- * @brief start a socket server
- *
- * @note framework for starting socket server
- */
-void SocketServer::run() {
-  if (tcpRdma_ == F_TCP) {
-    LOG(INFO) << "tcp server start ";
-    tcpServer();
-  } else if (tcpRdma_ == F_RDMA) {
-    LOG(INFO) << "rdma server start ";
-    rdmaServer();
-  }
-}
-
-/**
- * @brief class constructor for rdma client deamons
- *
- * @note  automatically start several client deamons for better performance
- */
-std::unique_ptr<RdmaClientDaemons> RdmaClientDaemons::daemons_ = nullptr;
-std::once_flag RdmaClientDaemons::initDataFlag_;
-
-RdmaClientDaemons::RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    rdma::init();
-
-    struct sxi_socket *socket;
-    onlineCpus_ = rdma::numCpus();
-    for (auto i = 0; i < onlineCpus_; i++) {
-      socket = rdma::csocket(i);
-      CHECK(socket) << "ERROR open client socket daemon";
-
-      rdmaClientSocket_.push_back(socket);
-    }
-    LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_;
-    /// round robin scheduler for new connection
-    curCpu_ = 0;
-    /// wait daemons to start completely.
-    sleep(2);
-  }
-}
-
-RdmaClientDaemons::~RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    for (auto i = 0; i < onlineCpus_; i++) {
-      rdma::close(rdmaClientSocket_[i]);
-    }
-    LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ "
-              << onlineCpus_;
-  }
-}
-
-/**
- * @brief worker thread main context
- *
- * @note  each connection from client(trainer) is controlled by single worker
- *        thread, which is for handling all parameter server requests
- */
-void SocketWorker::run() {
-  LOG(INFO) << "worker started, peer = " << channel_->getPeerName();
-
-  std::vector<iovec> inputIovs;
-
-  while (true) {
-    std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
-    if (!msgReader) {
-      break;
-    }
-
-    auto callback = [this](const std::vector<iovec> &outputIovs) {
-      channel_->writeMessage(outputIovs);
-    };
-
-    server_->handleRequest(std::move(msgReader), callback);
-  }
-
-  LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName();
-  delete this;
-}
-
-/**
- * @brief start one tcp connection to tcp server
- * @param[in] serverAddr  tcp server ip
- * @param[in] serverPort  tcp server port
- *
- * @note each object contains one channel which accept byte stream
- */
-void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent *server;
-
-  int errRet;  // temp for gethostbyname_r
-
-  /// Create a socket point
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-
-#if defined(__OSX__) || defined(__APPLE__)
-  server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
-  CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr
-                                   << " ret = " << errRet;
-  CHECK(server) << "getipnodebyname error!";
-#else
-  struct hostent hostinfo;
-  char buf[1024];  // temp for gethostbyname_r
-  CHECK_EQ(
-      0,
-      gethostbyname_r(
-          serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet))
-      << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
-  CHECK(server) << "gethostbyname_r error!";
-#endif
-
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char *)server->h_addr,
-        (char *)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  setOption(sockfd);
-
-  /// Now connect to the server
-  int retry_count = 0;
-  do {
-    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
-      break;
-    }
-
-    if (errno == ECONNREFUSED) {
-      LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_count++ >= 7) {
-        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
-      }
-      std::this_thread::sleep_for(std::chrono::seconds(1));
-    } else {
-      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
-                        << serverPort << "errorno: " << errno;
-    }
-  } while (errno == ECONNREFUSED);
-
-  channel_.reset(new SocketChannel(sockfd, serverAddr));
-  tcpRdma_ = F_TCP;
-}
-
-/**
- * @brief start one RDMA connection to rdma server
- * @param[in] serverAddr  rdma server ip
- * @param[in] serverPort  rdma server port
- *
- * @note  each object contains one channel which accept byte stream
- *        for rdma, low level sock also provide byte stream api.
- */
-void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
-  struct sxi_sock *sock;
-
-  std::stringstream ss;
-  ss << serverPort;
-
-  std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str();
-
-  RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get();
-  socketDaemon_ = daemons->selectDaemon();
-
-  /// connect to server with socket daemon
-  sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
-  CHECK(sock) << "ERROR connect to server" << rdmaUri;
-
-  std::vector<std::string> seg;
-  str::split(rdmaUri, '/', &seg);
-  std::string server = seg.at(seg.size() - 1);
-  channel_.reset(new SocketChannel(sock, server));
-  tcpRdma_ = F_RDMA;
-}
-
-/**
- * @brief class constructor
- * @param[in] serverAddr pserver ip address
- * @param[in] serverPort pserver port
- * @param[in] ChannelType F_TCP or F_RDMA
- *
- * @note  responsible for building one connection to specified pserver port
- */
-SocketClient::SocketClient(const std::string &serverAddr,
-                           int serverPort,
-                           enum ChannelType channelType) {
-  if (channelType == F_RDMA)
-    RdmaClient(serverAddr, serverPort);
-  else
-    TcpClient(serverAddr, serverPort);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/LightNetwork.h b/paddle/legacy/pserver/LightNetwork.h
deleted file mode 100644
index 380f86832f5..00000000000
--- a/paddle/legacy/pserver/LightNetwork.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SocketChannel.h"
-
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <vector>
-
-#include "paddle/legacy/utils/Thread.h"
-
-struct sxi_socket;
-
-namespace paddle {
-
-class SocketWorker;
-
-/**
- * @brief class for holding all parameters processing for current port
- *
- * @note  each parameter server inherits from one socket server, each
- *        server contains serveral woker threads which are to parallelize
- *        the processing of computation, but share some common datas stored
- *        in child class of socketserver.
- */
-class SocketServer : public Thread {
-  // rdmaCpu controls the cpu affinity of RDMA server daemon,
-  // which could benifit performance. rdmaCpu = -1 means TCP
-  // is used instead of RDMA transport.
- public:
-  SocketServer(const std::string& addr, int port, int rdmaCpu);
-  ~SocketServer();
-
-  virtual void run();
-
-  typedef std::function<void(const std::vector<iovec>& outputIovs)>
-      ResponseCallback;
-
- protected:
-  //
-  // The derived class needs to implement this function
-  // to handle the request received by SocketWorker
-  // The request is encapsulated by MsgReader, which contains
-  // a set of blocks.
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback) = 0;
-
-  std::unique_ptr<SocketChannel> createChannel(int sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-  std::unique_ptr<SocketChannel> createChannel(struct sxi_sock* sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-
-  friend class SocketWorker;
-
- private:
-  void rdmaServer();
-  void tcpServer();
-
-  void detach() {}  // detach accept thread is forbidden
-
- protected:
-  enum ChannelType tcpRdma_;
-  // for rdma
-  int rdmaCpu_;
-  std::string rdmaUri_;
-  sxi_socket* rdmaSocket_;
-  // for tcp
-  int port_;
-  std::string addr_;
-  int socket_;
-  int maxPendingConnections_;
-  bool stopping_;
-};
-
-/**
- * @brief class for holding one connection from one trainer
- *
- * @note  all parameter processing will run in the context of this worker
- */
-class SocketWorker : public Thread {
- public:
-  SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
-      : channel_(std::move(channel)), server_(server) {}
-
-  virtual ~SocketWorker() {}
-
-  virtual void run();
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-  SocketServer* server_;
-  enum ChannelType tcpRdma_;
-};
-
-/**
- * @brief class for providing rdma client deamon thread
- *
- * @note  the deamons are required by sock like rdam library. Here
- *        use singleton model for daemons. Each deamon hosts in
- *        single cpu core for better load balance performance
- */
-class RdmaClientDaemons {
- private:
-  RdmaClientDaemons();
-
-  static std::unique_ptr<RdmaClientDaemons> daemons_;
-
- public:
-  static RdmaClientDaemons* get() {
-    std::call_once(RdmaClientDaemons::initDataFlag_,
-                   &RdmaClientDaemons::getInstance);
-
-    return daemons_.get();
-  }
-
-  struct sxi_socket* selectDaemon() {
-    int cpu = curCpu_;
-    curCpu_ = (curCpu_ + 1) % onlineCpus_;
-
-    LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_;
-    return rdmaClientSocket_[cpu];
-  }
-
-  ~RdmaClientDaemons();
-
- public:
-  friend class SocketClient;
-
- private:
-  static std::once_flag initDataFlag_;
-  static void getInstance() {
-    if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
-  }
-
-  std::vector<struct sxi_socket*> rdmaClientSocket_;
-  std::atomic<int> curCpu_;
-  int onlineCpus_;
-};
-
-/**
- * @brief management for client connection which are from trainers
- *
- * @note  it contains one channel descriptor which used to write and
- *        read data
- */
-class SocketClient {
- public:
-  SocketClient(const std::string& serverAddr,
-               int serverPort,
-               enum ChannelType channelType);
-
-  SocketChannel* getChannel() { return channel_.get(); }
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-  struct sxi_socket* socketDaemon_;
-  enum ChannelType tcpRdma_;
-
- private:
-  void RdmaClient(const std::string& serverAddr, int serverPort);
-  void TcpClient(const std::string& serverAddr, int serverPort);
-};
-
-std::string getIpAddr(std::string& device);
-void setOption(int sockfd);
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.cpp b/paddle/legacy/pserver/ParameterClient2.cpp
deleted file mode 100644
index 264faa79184..00000000000
--- a/paddle/legacy/pserver/ParameterClient2.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "ParameterClient2.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
-
-namespace paddle {
-
-template <typename T1, typename T2>
-void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
-                         const T2* src,
-                         size_t size) {
-  dest->Clear();
-  dest->Reserve(size);
-  for (size_t i = 0; i < size; ++i) {
-    dest->AddAlreadyReserved(src[i]);
-  }
-}
-
-ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
-    : BaseClient(separate, numPorts), port_(port) {
-#ifndef PADDLE_DISABLE_TIMER
-  forwardbackwordTime_ = 0;
-#endif
-}
-
-int ParameterClient2::calcParameterBlockSize(
-    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
-  size_t totalSize = 0;
-  for (auto& para : parameters) {
-    totalSize += para->getSize();
-  }
-  size_t perServerSize = totalSize / serviceNum;
-
-  int sizeBits = 64 - __builtin_clzl(perServerSize);
-
-  /// 2^10 is min block size
-  /// 2^7 will be max number of blocks in one pserver
-  int blockSizeBits = std::max((sizeBits - 7), 10);
-  return 1 << blockSizeBits;
-}
-
-void ParameterClient2::initThreads() {
-  threadNum_ = serviceNum_;
-  if (FLAGS_parallel_thread_num > 1) {
-    LOG(INFO) << "parallel_thread_num dosent need to set";
-  }
-  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-  startThreads();
-}
-
-bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
-  destroy();
-
-  std::vector<std::string> hosts;
-  str::split(FLAGS_pservers, ',', &hosts);
-  serviceNum_ = hosts.size() * numPorts_;
-  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
-
-  /// setup prefetch matrix if exists
-  for (auto& para : parameters) {
-    /// set block size for each parameter
-    para->getConfig().set_parameter_block_size(
-        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
-                                                 : denseBlockSize);
-  }
-
-  for (auto& para : parameters) {
-    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
-    parameterMap_[para->getID()] = para;
-  }
-
-  allSegments_.reserve(parameters.size());
-
-  for (auto& para : parameters) {
-    ParameterSegments segments;
-    segments.name = para->getName();
-    segments.id = para->getID();
-    allSegments_.push_back(segments);
-    if (para->getConfig().sparse_remote_update()) {
-      CHECK_EQ(para->getConfig().parameter_block_size(),
-               para->getConfig().dims(1))
-          << "For sparse remote update parameter,"
-          << " block size is the width of each row.";
-    }
-  }
-
-  /// init clients
-  clients_.reserve(serviceNum_);
-  recvDataMems_.resize(serviceNum_);
-
-  for (size_t i = 0; i < hosts.size(); ++i) {
-    for (int j = 0; j < numPorts_; ++j) {
-      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
-                << port_ + j;
-      if (FLAGS_rdma_tcp == "rdma") {
-        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
-      } else {
-        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
-      }
-    }
-  }
-
-  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
-
-  sleep(2);
-
-  initThreads();
-
-  return true;
-}
-
-ParameterClient2::~ParameterClient2() { destroy(); }
-
-void ParameterClient2::destroy() {
-  if (clients_.empty()) {
-    /// this means not initialized.
-    return;
-  }
-  finishThreads();
-
-  parameterMap_.clear();
-  allSegments_.clear();
-  clients_.clear();
-}
-
-void ParameterClient2::sendParallel(int tid,
-                                    size_t numThreads,
-                                    ParameterType recvParameterType) {
-  int numMyClients = divup(serviceNum_ - tid, numThreads);
-
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_send");
-    int i = numThreads * j + tid;
-    /// Try to make different clients to send data to different pservers
-    /// at the same time so that they will not flood data to the same
-    /// pserver.
-    i = calcClientId(i, serviceNum_);
-    clients_[i].send("sendParameter",
-                     sendJob_.parallelRequests[i],
-                     sendJob_.parallelInputIovs[i]);
-
-    /// clear large structure
-    sendJob_.parallelRequests[i].Clear();
-    sendJob_.parallelInputIovs[i].clear();
-  }
-
-  std::vector<void*> bufs;
-  SendParameterResponse response;
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_recv");
-    int i = numThreads * j + tid;
-    i = calcClientId(i, serviceNum_);
-    auto msgReader = clients_[i].recv(&response);
-    CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-    bufs.clear();
-    bufs.reserve(response.blocks_size());
-    for (auto& block : response.blocks()) {
-      auto it = parameterMap_.find(block.para_id());
-      CHECK(it != parameterMap_.end());
-      Parameter* parameter = it->second.get();
-      real* buf = nullptr;
-      if (parameter->getBuf(recvParameterType)) {
-        buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos());
-      } else {
-        auto recvMat = dynamic_cast<SparseRowCpuMatrix*>(
-            parameter->getMat(recvParameterType).get());
-        CHECK(recvMat);
-        size_t width = parameter->getConfig().dims(1);
-        // TODO(wuyi): need add lock here? may also cause resize.
-        buf = recvMat->getLocalRow(block.begin_pos() / width);
-      }
-      /// sparse_id is not useful while receiving data since sparse data
-      /// storage is continuous, do commit recieved data as that of dense.
-      bufs.push_back(buf);
-    }
-    msgReader->readBlocks(bufs);
-  }
-}
-
-void ParameterClient2::prepareSendData(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    BatchStatus batchStatus,
-    SendJob* sendJob) {
-  sendJob->parallelRequests.resize(serviceNum_);
-  sendJob->parallelInputIovs.resize(serviceNum_);
-
-  for (auto& request : sendJob->parallelRequests) {
-#ifndef PADDLE_DISABLE_TIMER
-    if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) {
-      request.set_forwardbackward_time(forwardbackwordTime_);
-    }
-#endif
-    request.set_trainer_id(trainerId_);
-    request.set_update_mode(updateMode);
-    request.set_send_back_parameter(sendBackParameter);
-    request.set_send_back_parameter_type(sendBackParameterType);
-    request.set_num_samples(numSamples);
-    request.set_cost(cost);
-    request.set_batch_status(batchStatus);
-    CHECK_EQ(request.blocks_size(), 0);
-    VLOG(1) << "request: trainer_id: " << request.trainer_id() << " update_mode"
-            << request.update_mode()
-            << " send_back_parameter: " << request.send_back_parameter()
-            << " send_back_parameter_type: "
-            << request.send_back_parameter_type()
-            << " num_samples: " << request.num_samples()
-            << " cost: " << request.cost()
-            << " batch_status: " << request.batch_status();
-  }
-  for (const auto& segments : parameterSegments) {
-    const auto it = parameterMap_.find(segments.id);
-    CHECK(it != parameterMap_.end());
-    Parameter* parameter = it->second.get();
-    CHECK(parameter != nullptr) << "parameter is nullptr";
-    int64_t nameHash = std::hash<std::string>()(segments.name);
-    bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE ||
-                         updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    bool sparseUpdate = parameter->getConfig().sparse_remote_update() &&
-                        (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT ||
-                         updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE);
-
-    const auto blockSize = parameter->getConfig().parameter_block_size();
-    CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize;
-    const auto paraSize = parameter->getSize();
-    if (sparseUpdate) {
-      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
-          parameter->getMat(PARAMETER_VALUE));
-      CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
-      auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
-          parameter->getMat(parameterType).get());
-      CHECK(sendMat != nullptr) << "sendMat is nullptr";
-
-      syncThreadPool_->exec([&](int tid, size_t numThreads) {
-        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
-        const auto& localIndices = prefetchMat->getLocalIndices();
-        /// num of sparse rows
-        size_t nLocalBlocks = localIndices.size();
-        uint64_t beginDim = 0;
-        uint64_t endDim = 0;
-
-        // HACK(typhoonzero): let it resize first
-        prefetchMat->getLocalRow(nLocalBlocks);
-        sendMat->getLocalRow(nLocalBlocks);
-
-        for (size_t row = 0; row < nLocalBlocks; ++row) {
-          int64_t blockId = localIndices[row];  // local row -> sparse row
-          int serverId = std::abs((blockId + nameHash) % serviceNum_);
-          if (serverId % numThreads != (size_t)tid) {
-            continue;
-          }
-
-          beginDim = blockId * blockSize;
-          endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-
-          auto& request = sendJob->parallelRequests[serverId];
-          ParameterBlock* block = request.add_blocks();
-          block->set_para_id(segments.id);
-          /// global sparse row id
-          block->set_block_id(blockId);
-          /// local row offset
-          block->set_begin_pos(row * blockSize);
-          /// block len
-          block->set_block_size(endDim - beginDim);
-          if (sendingPara) {
-            sendJob->parallelInputIovs[serverId].push_back(
-                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
-            /// detect sparse parameter distribution
-            sparseDistribution_->probeDistribution(serverId,
-                                                   sizeof(real) * blockSize);
-          }
-        }
-      });
-
-    } else {  /// parameter set for dense and sparse
-      real* buf =
-          sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr;
-      uint64_t endDim = 0;
-      for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) {
-        endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-        int64_t blockId = beginDim / blockSize;
-        int serverId = std::abs((blockId + nameHash) % serviceNum_);
-
-        auto& request = sendJob->parallelRequests[serverId];
-        ParameterBlock* block = request.add_blocks();
-        block->set_para_id(segments.id);
-        block->set_block_id(blockId);
-        block->set_begin_pos(beginDim);
-        block->set_block_size(endDim - beginDim);
-        if (buf) {
-          sendJob->parallelInputIovs[serverId].push_back(
-              {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))});
-        }
-      }
-    }
-  }  // parameterSegments
-
-  sparseDistribution_->checkAndResetDistribution();
-}
-
-void ParameterClient2::sendAndReceiveParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    ParameterType recvParameterType) {
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  sendBackParameterType,
-                  /*batchStatus = */ BATCH_START_AND_FINISH,
-                  &sendJob_);
-
-  syncThreadPool_->exec([&](int tid, size_t numThreads) {
-    this->sendParallel(tid, numThreads, recvParameterType);
-  });
-}
-
-void ParameterClient2::sendParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    BatchStatus batchStatus) {
-  SendJobPtr sendJob = std::make_shared<SendJob>();
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  PARAMETER_VALUE,
-                  batchStatus,
-                  sendJob.get());
-
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(sendJob);
-  }
-}
-
-void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); }
-
-void ParameterClient2::send(int threadId) {
-  int index = threadId;
-  LOG(INFO) << "send thread " << threadId << " started";
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    SendJobPtr recvJob = sendJobQueue_[index]->dequeue();
-    if (stopping_) {
-      recvJobQueue_[index]->enqueue(recvJob);
-      break;
-    }
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_send");
-      int i = threadNum_ * j + index;
-      /// Try to make different clients to send data to different pservers
-      /// at the same time so that they will not flood data to the same
-      /// pserver.
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        clients_[i].send("sendParameter",
-                         recvJob->parallelRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      } else {
-        clients_[i].send("sendData",
-                         recvJob->parallelDataRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      }
-    }
-    recvJobQueue_[index]->enqueue(recvJob);
-  }
-}
-
-void ParameterClient2::recv(int threadId) {
-  LOG(INFO) << "recv thread " << threadId << " started";
-  int index = threadId;
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    std::vector<void*> bufs;
-    SendParameterResponse response;
-    SendDataResponse dataResponse;
-    SendJobPtr recvJob = recvJobQueue_[index]->dequeue();
-    if (stopping_) break;
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_recv");
-      int i = threadNum_ * j + index;
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        auto msgReader = clients_[i].recv(&response);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-        bufs.clear();
-        bufs.reserve(response.blocks_size());
-        for (auto& block : response.blocks()) {
-          auto it = parameterMap_.find(block.para_id());
-          CHECK(it != parameterMap_.end());
-          Parameter* parameter = it->second.get();
-          real* buf =
-              parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos());
-          CHECK_EQ(msgReader->getBlockLength(bufs.size()),
-                   sizeof(real) * (block.block_size()));
-          bufs.push_back(buf);
-        }
-        msgReader->readBlocks(bufs);
-      } else {
-        auto msgReader = clients_[i].recv(&dataResponse);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size());
-        size_t totalLen = msgReader->getTotalLength();
-        if (0 == totalLen) {
-          continue;
-        }
-        auto& recvMem = recvDataMems_[dataResponse.server_id()];
-        CHECK_EQ(dataResponse.blocks_size(), 1)
-            << "Only one block currently support now!";
-        auto& block = dataResponse.blocks(0);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        recvMem = std::make_shared<CpuMemoryHandle>(totalLen);
-        msgReader->readNextBlock(recvMem.get()->getBuf());
-      }
-    }
-    recvSyncBarrier_->wait();
-  }
-}
-
-void ParameterClient2::waitPassStart() {
-  WaitPassStartRequest request;
-  std::vector<WaitPassStartResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitPassFinish() {
-  WaitPassFinishRequest request;
-  std::vector<WaitPassFinishResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  request.set_trainer_id(trainerId_);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::setConfig(const OptimizationConfig& optConfig,
-                                 const std::string& saveDir,
-                                 bool isSparseServer) {
-  SetConfigRequest request;
-  std::vector<SetConfigResponse> responses;
-
-  for (auto& nameAndPara : parameterMap_) {
-    *request.add_param_configs() = nameAndPara.second->getConfig();
-  }
-
-  *request.mutable_opt_config() = optConfig;
-  request.set_save_dir(saveDir);
-  request.set_is_sparse_server(isSparseServer);
-
-  std::vector<SetConfigRequest> requests;
-  requests.resize(clients_.size());
-  for (size_t i = 0; i < requests.size(); ++i) {
-    requests[i].CopyFrom(request);
-    requests[i].set_server_id(i);
-  }
-
-  responses.resize(clients_.size());
-  size_t numClients = clients_.size();
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].send(__func__, requests[i]);
-  }
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].recv(&responses[i]);
-  }
-}
-
-bool ParameterClient2::inStatus(PServerStatus status) {
-  GetStatusRequest request;
-  std::vector<GetStatusResponse> responses;
-
-  bool ok = true;
-  multiCall("getStatus", request, &responses);
-  for (auto& response : responses) {
-    if (response.status() != status) {
-      ok = false;
-    }
-  }
-
-  return ok;
-}
-
-void ParameterClient2::setStatus(PServerStatus status) {
-  SetStatusRequest request;
-  request.set_status(status);
-  std::vector<SetStatusResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitForStatus(PServerStatus status) {
-  while (!inStatus(status)) {
-    sleep(1);
-  }
-}
-
-template <typename Proto>
-static void validateResponses(const std::vector<Proto>& responses) {
-  for (auto& response : responses) {
-    CHECK(response.return_message().empty())
-        << "client" << &response - &responses[0]
-        << " error:" << response.return_message();
-  }
-}
-
-PServerVector ParameterClient2::createVector() {
-  CreateVectorRequest request;
-  std::vector<CreateVectorResponse> responses;
-  int64_t handle = -1;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerVector{handle};
-}
-
-void ParameterClient2::releaseVector(PServerVector handle) {
-  ReleaseVectorRequest request;
-  std::vector<ReleaseVectorResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-PServerMatrix ParameterClient2::createMatrix(int32_t numCols) {
-  CreateMatrixRequest request;
-  std::vector<CreateMatrixResponse> responses;
-  int64_t handle = -1;
-
-  request.set_num_cols(numCols);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerMatrix{handle};
-}
-
-void ParameterClient2::releaseMatrix(PServerMatrix handle) {
-  ReleaseMatrixRequest request;
-  std::vector<ReleaseMatrixResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) {
-  ProtoVector& pvec = *op->add_vectors();
-  size_t dim = vec->getSize();
-  pvec.set_dim(dim);
-  copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize());
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
-  ProtoMatrix& pmat = *op->add_matrices();
-  pmat.set_num_cols(mat->getWidth());
-  pmat.set_num_rows(mat->getHeight());
-  copyToRepeatedField(
-      pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
-}
-
-static inline real addTwo(real a, double b) { return a + b; }
-
-void ParameterClient2::doOperation(PreparedOperations& ops,
-                                   bool waitForGradient,
-                                   bool sendBackGradient,
-                                   bool releasePass) {
-  std::vector<DoOperationResponse> responses;
-  ops.request_.set_wait_for_gradient(waitForGradient);
-  ops.request_.set_send_back_parameter(sendBackGradient);
-  ops.request_.set_release_pass(releasePass);
-  multiCall(__func__, ops.request_, &responses);
-  validateResponses(responses);
-  size_t numPassFinishServers = 0;
-
-  size_t numOps = ops.request_.operations_size();
-  for (auto& response : responses) {
-    numPassFinishServers += response.pass_finish();
-    CHECK_EQ(numOps, (size_t)response.results_size());
-    for (size_t opId = 0; opId < numOps; ++opId) {
-      const OperationResult& result = response.results(opId);
-      std::vector<real*>& resultScalars = ops.localResults_[opId].resultScalars;
-      std::vector<CpuVectorPtr>& resultVectors =
-          ops.localResults_[opId].resultVectors;
-      std::vector<CpuMatrixPtr>& resultMatrices =
-          ops.localResults_[opId].resultMatrices;
-
-      if (&response == &responses[0]) {
-        /// Initialize results to zero
-
-        resultScalars.resize(result.scalars_size());
-        for (auto p : resultScalars) {
-          if (!p) continue;
-          *p = 0;
-        }
-        size_t numVectors = result.vectors_size();
-        resultVectors.resize(numVectors);
-        for (size_t i = 0; i < numVectors; ++i) {
-          if (!resultVectors[i]) continue;
-          resultVectors[i]->resize(result.vectors(i).dim());
-          resultVectors[i]->zeroMem();
-        }
-        size_t numMatrices = result.matrices_size();
-        resultMatrices.resize(numMatrices);
-        for (size_t i = 0; i < numMatrices; ++i) {
-          if (!resultMatrices[i]) continue;
-          resultMatrices[i]->resize(result.matrices(i).num_rows(),
-                                    result.matrices(i).num_cols());
-          resultMatrices[i]->zeroMem();
-        }
-      }
-
-      // aggregate results from each pserver to results
-
-      CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size());
-      for (ssize_t i = 0; i < result.scalars_size(); ++i) {
-        real* rscalar = resultScalars[i];
-        if (!rscalar) continue;
-        *rscalar += result.scalars(i);
-      }
-
-      CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size());
-      for (auto& vec : result.vectors()) {
-        int i = &vec - &result.vectors(0);
-        CpuVectorPtr rvec = resultVectors[i];
-        if (!rvec) continue;
-        CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        std::transform(rvec->getData(),
-                       rvec->getData() + rvec->getSize(),
-                       vec.values().data(),
-                       rvec->getData(),
-                       addTwo);
-      }
-
-      CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
-      for (auto& mat : result.matrices()) {
-        int i = &mat - &result.matrices(0);
-        CpuMatrixPtr rmat = resultMatrices[i];
-        if (!rmat) continue;
-        CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
-        CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-
-        std::transform(rmat->getData(),
-                       rmat->getData() + rmat->getElementCnt(),
-                       mat.values().data(),
-                       rmat->getData(),
-                       addTwo);
-      }
-    }
-  }
-  passFinish_ = numPassFinishServers == clients_.size();
-}
-
-real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) {
-  real result = 0.0;
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_utv, u, v)(&result);
-  doOperation(ops, false, false);
-  return result;
-}
-
-void ParameterClient2::vectorScale(PServerVector u, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au, u, a);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_COPY, src, dst);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMultInto(PServerVector u,
-                                         PServerVector v,
-                                         PServerVector w,
-                                         real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorScaleInto(PServerVector u,
-                                       PServerVector v,
-                                       real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::loadValueVector(const std::string& dirName) {
-  LoadValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<LoadValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void ParameterClient2::saveValueVector(const std::string& dirName) {
-  SaveValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<SaveValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.h b/paddle/legacy/pserver/ParameterClient2.h
deleted file mode 100644
index 9320e19c4df..00000000000
--- a/paddle/legacy/pserver/ParameterClient2.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/pserver/BaseClient.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-#include "SparseParameterDistribution.h"
-
-DECLARE_int32(parallel_thread_num);
-
-namespace paddle {
-
-struct PServerMatrix {
-  int64_t handle;
-};
-
-struct PServerVector {
-  int64_t handle;
-};
-
-/**
- * @brief A class to help to prepare server-side operations.
- */
-class PreparedOperations {
- protected:
-  class ResultsAdder;
-  struct LocalOperationResult;
-
- public:
-  /**
-   * Offers an easy way to prepare operations that will be performed on
-   * server-side.
-   *
-   * Usage:
-   * @code
-   *   addOperation(optype, arguments...)(results...)
-   * @endcode
-   *
-   * Examples:
-   * 1. set pserver vector to 1:
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   addOperation(PSERVER_OP_RESET, u, (real)1);
-   * @endcode
-   *
-   * 2. Compute inner product of to pserver vectors.
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   PServerVector v = parameterClient.createVector();
-   *   real result;
-   *   addOperation(PSERVER_OP_utv, u, v)(&result)
-   * @endcode
-   *
-   * @param[in] operation The operation that pserver will perform.
-   * @param[in] args Argument list of the operation
-   * @return A ResultsAdder object initialized with the last element of
-   *         localResults_.
-   */
-  template <typename... Args>
-  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
-    Operation* op = request_.add_operations();
-    op->set_operation(operation);
-    localResults_.emplace_back();
-    addOperationHelper(op, args...);
-    return ResultsAdder(&localResults_.back());
-  }
-
- protected:
-  void addOperationHelper(Operation* op) {}
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerVector
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerVector arg) {
-    op->add_pvectors(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerMatrix
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerMatrix arg) {
-    op->add_pmatrices(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a real valued
-   *        scalar as an operand.
-   */
-  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuVectorPtr
-   *        as an operand.
-   * @note The array of CpuVectors that arg points to will be copied to
-   *       op's vectors field.
-   */
-  void addOperationHelper(Operation* op, CpuVectorPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuMatrixPtr
-   *        as an operand.
-   * @note The array of CpuMatrixs that arg points to will be copied to
-   *       op's matrices field.
-   */
-  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation and prepare the operands.
-   *
-   * @tparam Arg An operand of the operation.
-   * @tparam Args A list of rest operands of the operation.
-   * @param op Pointer to an Operation object.
-   */
-  template <typename Arg, typename... Args>
-  void addOperationHelper(Operation* op, Arg arg, Args... args) {
-    addOperationHelper(op, arg);
-    addOperationHelper(op, args...);
-  }
-
-  /**
-   * @brief ResultsAdder offers easy ways to quickly store operation results.
-   */
-  class ResultsAdder {
-   public:
-    explicit ResultsAdder(LocalOperationResult* localResult)
-        : localResult_(localResult) {}
-    template <typename... Args>
-    void operator()(Args... args) {
-      addResult(args...);
-    }
-    void addResult() {}
-    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
-    void AddResult(CpuVectorPtr arg) {
-      localResult_->resultVectors.push_back(arg);
-    }
-    void AddResult(CpuMatrixPtr arg) {
-      localResult_->resultMatrices.push_back(arg);
-    }
-    template <typename Arg, typename... Args>
-    void addResult(Arg arg, Args... args) {
-      addResult(arg);
-      addResult(args...);
-    }
-
-   protected:
-    LocalOperationResult* localResult_;
-  };
-
- protected:
-  DoOperationRequest request_;
-  std::vector<iovec> inputIovs_;
-  struct LocalOperationResult {
-    std::vector<real*> resultScalars;
-    std::vector<CpuVectorPtr> resultVectors;
-    std::vector<CpuMatrixPtr> resultMatrices;
-  };
-  std::vector<LocalOperationResult> localResults_;
-  friend class ParameterClient2;
-};
-
-struct ParameterSegments {
-  std::string name;  // name of the parameter
-  size_t id;         // id of the parameter
-};
-
-/**
- * The client interface for parameter server. ParameterClient2 supports 2 modes
- * for managing connections to parameter servers, in the 1st mode one connection
- * is shared by 2 threads that are separately responsible for sending and
- * recieving activities, in the 2nd mode one connection is owned by only one
- * thread, and all the sending and recieving activities run in that single
- * thread.
- * TODO(yanfei):
- * Additional core idea to further optimizate pserver performance is
- * to do sync-sgd based parameter level instead of pserver level.
- * full-parallelization based parameter level for sync-sgd also can
- * sense forwardbackward computation layer-by-layer for more deeper layer
- * model.
- * Firstly, pserver can do full-parallelization on all computation based
- * parameter level instead of waiting for all gradients are finished and
- * start to send back parameters value immediately if parameter is ready
- * instead of waiting for all parameters value are ready
- * Secondly, parameter client can write back parameters to GPU instead of
- * waiting until all parameters are received to CPU host end.
- */
-class ParameterClient2 : public BaseClient {
- public:
-  /** Constructor.
-   * @param separate True if sending and recieving activities are separated
-   *                 into 2 threads, otherwise false.
-   * @param port Port number that parameter client runs on.
-   * @param numPorts Number of ports parameter clients occupies,
-   *                 numPorts * pserver number is the total number of
-   *                 connections the parameter client maintains.
-   */
-  ParameterClient2(bool separate = false,
-                   int port = FLAGS_port,
-                   int numPorts = FLAGS_ports_num);
-
-  ~ParameterClient2();
-
-  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
-                                    size_t serviceNum);
-
- public:
-  bool init(const std::vector<ParameterPtr>& parameters);
-
-  /// service functions
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers, then receives
-   *        the response from the servers.
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] sendBackParameterType Send back parameter type on pserver,
-   *            PARAMETER_VALUE by default
-   * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to
-   *            client[recvParameterType]
-   * @note Only parameterType will be sent.
-   */
-  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
-                               ParameterType parameterType,
-                               const std::vector<ParameterSegments>& segments,
-                               int64_t numSamples,
-                               real cost,
-                               bool sendBackParameter,
-                               ParameterType sendBackParameterType,
-                               ParameterType recvParameterType);
-
-  /**
-   * @brief Sends all parameters to parameter servers, and receives the response
-   *        from the servers.
-   */
-  void sendAndReceiveParameter(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType = PARAMETER_VALUE,
-      ParameterType recvParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(updateMode,
-                            parameterType,
-                            allSegments_,
-                            numSamples,
-                            cost,
-                            sendBackParameter,
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers. Each
-   *        sendParameter() must be paired with a recvParameter() in the future.
-   *        Only parameterType will be sent.
-   *
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] batchStatus Status of the batch.
-   * @note This function is non-blocking. This means that parameter should
-   *       not change between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     const std::vector<ParameterSegments>& segments,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus);
-
-  void recvParameter();
-
-  /**
-   * Sends all parameters to parameter servers, recvParameter() have to be
-   * invoked
-   * afterwards.
-   *
-   * @note This function is non-blocking. This means that if parameter should
-   *       not changes between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus) {
-    sendParameter(updateMode,
-                  parameterType,
-                  allSegments_,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  batchStatus);
-  }
-
-  /// Get all parameters from parameter servers
-  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
-                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Get parameters by sparse row ids from parameter servers
-  void getParameterSparse(
-      ParameterType recvParameterType = PARAMETER_VALUE,
-      ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Set all parameters on parameter servers using the local parameters
-  void setParameter() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-  /**
-   * Set all parameters on parameter servers, values will be zero
-   * means do not sending local parameters
-   */
-  void setParameterZero() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-
-  /**
-   * @brief Wait until all gradient servers start one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd"
-   *       algorithm. Calling this function means that the calling gradient
-   *       server is ready to start a new pass.
-   */
-  void waitPassStart();
-
-  /**
-   * @brief Wait until all gradient servers finish one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd" algorithm.
-   *       Calling this function means that the calling gradient server
-   *       finishes one pass.
-   */
-  void waitPassFinish();
-
-  /// Wait until all gradient servers call this function.
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /// Called when async-sgd finish pass.
-  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
-    return synchronize(syncObjectId);
-  }
-
-  /**
-   * @brief Execute the prepared operations on pservers, fetch the results and
-   *        aggregate results from different pservers.
-   * @param[in] ops Prepared operations that will be executed on pservers.
-   * @param[in] waitForGradient If true, wait for gradient to be ready before
-   *            starting the operations.
-   * @param[in] sendBackParameter If true, send back the parameter to clients
-   *            after the operations are finished.
-   * @param[in] If true, and if all clients call waitPassFinish, signal all
-   *            clients finish the pass.
-   */
-  void doOperation(PreparedOperations& ops,
-                   bool waitForGradient,
-                   bool sendBackParameter,
-                   bool releasePass = true);
-
-  /**
-   * Set the configuration of pserver, including parameter config and
-   * optimization config
-   */
-  void setConfig(const OptimizationConfig& optConfig,
-                 const std::string& saveDir = "",
-                 bool isSparseServer = false);
-
-  /// Return true if all pservers are in the given status
-  bool inStatus(PServerStatus status);
-  bool isPassFinish() { return passFinish_; }
-
-  /// Set pserver status
-  void setStatus(PServerStatus status);
-
-  /**
-   * @brief Wait until all pservers are at status
-   * @note This function is not suitable for frequent use,
-   *       because it sleeps 1 second each time when condition is satisfied.
-   */
-  void waitForStatus(PServerStatus status);
-
-  /// Create a column vector. The size is the dimension of parameter.
-  PServerVector createVector();
-
-  /// Release the PServerVector given handle.
-  void releaseVector(PServerVector handle);
-
-  /**
-   * Create a column major matrix. The number of rows is the dimension of
-   * parameter. The number of columns is specifed by numCols.
-   */
-  PServerMatrix createMatrix(int32_t numCols);
-
-  /// Release the PServerMatrix given handle.
-  void releaseMatrix(PServerMatrix handle);
-
-  // Some basic algebra functions
-  /// Calculate the dot product of u and v
-  real vectorDotProduct(PServerVector u, PServerVector v);
-
-  /// Scale u by a
-  void vectorScale(PServerVector u, real a);
-
-  /// Copy from src to dest
-  void vectorCopy(PServerVector src, PServerVector dst);
-
-  /// u += v * a
-  void vectorAddMult(PServerVector u, PServerVector v, real a);
-
-  /// u = v + w * a
-  void vectorAddMultInto(PServerVector u,
-                         PServerVector v,
-                         PServerVector w,
-                         real a);
-  /// u = v * a
-  void vectorScaleInto(PServerVector u, PServerVector v, real a);
-
-  /// Return pserver parameter value.
-  PServerVector getPServerParameterValue() {
-    PServerVector vec;
-    vec.handle = PARAMETER_VALUE;
-    return vec;
-  }
-
-  /// Return pserver parameter gradient.
-  PServerVector getPServerParameterGradient() {
-    PServerVector vec;
-    vec.handle = PARAMETER_GRADIENT;
-    return vec;
-  }
-
-  /**
-   * Tell pservers to load value vector from file.
-   *
-   * @param[in] dirName The directory that contains the value vector file.
-   */
-  void loadValueVector(const std::string& dirName);
-
-  /// Tell pservers to save value vector to file.
-  void saveValueVector(const std::string& dirName);
-
-  void setTrainerId(int trainerId) { trainerId_ = trainerId; }
-
-#ifndef PADDLE_DISABLE_TIMER
-  void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
-#endif
-
- protected:
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
- private:
-  void destroy();
-
-  /**
-   * @brief management function for parallelizing send/recv all connections
-   *        to all pservers. it is called under one SyncThreadPool. it
-   *        supports to use N thread to control M connections. the receiving
-   *        actions can be started until all sending action to all connections
-   *        owned by current thread are finished. Different connections
-   * controlled
-   *        by different threads can transfer data asynchronously.
-   */
-  void sendParallel(int tid,
-                    size_t numThreads,
-                    ParameterType recvParameterType);
-  /// sending thread routine for asynchronously send data
-  void send(int threadId);
-  /// receiving thread routing for asynchronously receive data
-  void recv(int threadId);
-
-  /**
-   * @brief main routine to build data for pserver
-   *
-   * @note  it can prepare different kinds of parameter type data. it can
-   *        be regarded as layer for bridging real parameters data and
-   *        protobuf data for communication.
-   *        TODO(yanfei):
-   *        can abstract additional layer to encode and decode data to/from
-   *        protobuf data.
-   */
-  void prepareSendData(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,  // client send type
-      const std::vector<ParameterSegments>& parameterSegments,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType,  // send back type in pserver
-      BatchStatus batchStatus,
-      SendJob* sendJob);
-
-  /// start necessary threads for threadPool
-  void initThreads();
-
- protected:
-  /// start port number of pserver
-  /// it deduce all ports for dense and sparse with some rules
-  int port_;
-  /// identify the trainer id using this client
-  int trainerId_;
-
-#ifndef PADDLE_DISABLE_TIMER
-  uint64_t forwardbackwordTime_;
-#endif
-  std::mutex sparseAutoGrowthMutex_;
-
-  /// map id to parameter used for decoding protobuf data
-  std::unordered_map<size_t, ParameterPtr> parameterMap_;
-  /// segments for all parameters that needed to sync
-  std::vector<ParameterSegments> allSegments_;
-
-  /// module for sensing sparse parameters distribution on all pservers
-  std::unique_ptr<SparseParameterDistribution> sparseDistribution_;
-
-  /// thread pool for parallelizing all connections to pservers
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  bool passFinish_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.cpp b/paddle/legacy/pserver/ParameterServer2.cpp
deleted file mode 100644
index 8533a322d92..00000000000
--- a/paddle/legacy/pserver/ParameterServer2.cpp
+++ /dev/null
@@ -1,1401 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServer2.h"
-
-#include <algorithm>
-#include <fstream>
-
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include "paddle/legacy/parameter/ParameterUpdateFunctions.h"
-#include "paddle/legacy/parameter/Regularizer.h"
-#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-DEFINE_double(async_lagged_ratio_min,
-              1.0,
-              "control config_.async_lagged_grad_discard_ratio() min value");
-DEFINE_double(
-    async_lagged_ratio_default,
-    1.5,
-    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
-    "use it as defalut value");
-
-namespace paddle {
-
-const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
-    "Invalid matrix handle";
-const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
-    "Invalid vector handle";
-const std::string ParameterServer2::kRetMsgUnknownOperation =
-    "Unknown operation";
-
-ParameterServer2::ParameterServer2(const std::string& addr,
-                                   int port,
-                                   int rdmaCpu)
-    : ProtoServer(addr, port, rdmaCpu),
-      dataSize_(0),
-      size_(0),
-      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      passBarrier_(FLAGS_num_gradient_servers + 1),
-      numPassFinishClients_(0),
-      allClientPassFinish_(false),
-      serverId_(-1),
-      batchId_(-1) {
-  /**
-   * register function for remote client calling, these functions
-   * will be mapped to a data structure for quick looking up. each
-   * request from trainer can contains one function name to indicate
-   * remote action. this architecture looks like rpc style for pserver.
-   */
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
-
-  /// thread pool for parallelizing some computations
-  if (FLAGS_pserver_num_threads > 1) {
-    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
-  }
-}
-
-bool ParameterServer2::init() {
-  vectors_.resize(NUM_PARAMETER_TYPES);
-  configMap_.clear();
-
-  numSamplesProcessed_ = 0;
-  cost_ = 0;
-  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
-  if (mpienv != NULL) {
-    mpiSize_ = atoi(mpienv);
-  } else {
-    mpiSize_ = 1;
-  }
-  status_ = PSERVER_STATUS_NOT_SET;
-  dataMems_.resize(FLAGS_num_gradient_servers);
-  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
-  for (auto& barrier : synchronizeBarriers_) {
-    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
-  }
-
-  // initialization for dicarding lagging gradient
-  asyncUpdateSteps_ = 0;
-  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
-  asyncLaggedGradientsNum_ = 0;
-  asyncUpdateStat_.resize(static_cast<int>(FLAGS_num_gradient_servers *
-                                           FLAGS_async_lagged_ratio_default));
-  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
-  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
-  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
-
-  return true;
-}
-
-void ParameterServer2::getStatus(const GetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  (void)request;
-  GetStatusResponse response;
-  response.set_status(status_);
-  callback(response);
-}
-
-void ParameterServer2::setStatus(const SetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  status_ = request.status();
-  SetStatusResponse response;
-  callback(response);
-}
-
-void ParameterServer2::setConfig(const SetConfigRequest& request,
-                                 ProtoResponseCallback callback) {
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-
-    serverId_ = request.server_id();
-    isSparseServer_ = request.is_sparse_server();
-
-    if (!request.save_dir().empty()) {
-      mkDir(request.save_dir().c_str());
-    }
-
-    for (const auto& config : request.param_configs()) {
-      CHECK(!configMap_.count(config.para_id()))
-          << "Duplicated parameter name: " << config.name();
-      configMap_[config.para_id()] = config;
-      CHECK_EQ(config.sparse_remote_update(), isSparseServer_);
-    }
-
-    config_ = request.opt_config();
-    if (config_.algorithm() == TrainAlgorithm::AsyncSGD) {
-      auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio();
-      if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) {
-        LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small"
-                  << "reset to default, async_lagged_grad_discard_ratio = "
-                  << FLAGS_async_lagged_ratio_default;
-        asyncLaggedRatio = FLAGS_async_lagged_ratio_default;
-      }
-      asyncLaggedThreshold_ =
-          static_cast<int64_t>(FLAGS_num_gradient_servers * asyncLaggedRatio);
-      LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio
-                << " asyncLaggedhreshold: " << asyncLaggedThreshold_;
-    }
-    if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) {
-      /// sparse server must NOT use local update mode
-      config_.set_num_batches_per_send_parameter(1);
-    }
-
-    if (config_.num_batches_per_send_parameter() > 1 &&
-        config_.center_parameter_update_method() == "average") {
-      /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer
-      /// if parameter regularization in pserver
-      for (auto& pair : configMap_) {
-        ParameterConfig& config = pair.second;
-        if (config_.num_batches_per_send_parameter() ==
-            config.num_batches_regularization()) {
-          real scale =
-              config_.delta_add_rate() * config.num_batches_regularization();
-          if (config_.algorithm() == "sgd") {
-            scale *= FLAGS_num_gradient_servers;
-          }
-          config.set_decay_rate(config.decay_rate() * scale);
-          if (config.decay_rate() > 0.1f) {
-            LOG(FATAL) << "L2 decay=" << config.decay_rate()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-          config.set_decay_rate_l1(config.decay_rate_l1() * scale);
-          if (config.decay_rate_l1() > 0.1f) {
-            LOG(FATAL) << "L1 decay=" << config.decay_rate_l1()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-
-          LOG(INFO) << "parameter:" << config.name()
-                    << " decay apply in pserver,"
-                    << " L1 decay=" << config.decay_rate_l1()
-                    << " L2 decay=" << config.decay_rate();
-        }
-      }
-    }
-  }
-
-  SetConfigResponse response;
-  callback(response);
-}
-
-real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
-  real sum = 0;
-  for (const auto buffer : buffers) {
-    for (size_t i = 0; i < buffer.size; ++i) {
-      sum += buffer.base[i];
-    }
-  }
-  return sum;
-}
-
-void ParameterServer2::mergeSegments(BlockSegments* segments) {
-  if (segments->empty()) {
-    return;
-  }
-  std::sort(segments->begin(), segments->end());
-  auto curr = segments->begin();
-  for (auto it = segments->begin(); it != segments->end(); ++it) {
-    if (it->first <= curr->second) {
-      curr->second = std::max(curr->second, it->second);
-    } else {
-      ++curr;
-      *curr = *it;
-    }
-  }
-  ++curr;
-  segments->erase(curr, segments->end());
-}
-
-void ParameterServer2::setParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)response;
-  (void)outputBuffers;
-  LOG(INFO) << "pserver: setParameter";
-  std::lock_guard<RWLock> guard(parameterMutex_);
-
-  int64_t numBlocks = blockIdMap_.size();
-  CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size());
-  /// total bytes for all the added blocks
-  int64_t totalSize = size_;
-  std::vector<int64_t> offsets;
-  offsets.reserve(request.blocks_size());
-  std::vector<int64_t> blockIds;
-  blockIds.reserve(request.blocks_size());
-  int bufferIndex = 0;
-
-  if (!request.blocks().size()) {
-    LOG(WARNING)
-        << "--ports_num or --ports_num_for_sparse might be too large, "
-        << "or total dense parameter size or sparse parameters size "
-        << "might be too small, this psever doesn't store any parameter.";
-    return;
-  }
-
-  for (const auto& block : request.blocks()) {
-    /// block size for parameter(e.g. 128 for sparse row, 1K for dense)
-    uint64_t blockSize = getParameterConfig(block).parameter_block_size();
-    BlockKey key(block.para_id(), block.block_id());
-    if (inputBuffers.size()) {  // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-      CHECK_EQ(buffer.size, block.block_size())
-          << "data size is too big:"
-          << " block_size=" << block.block_size()
-          << " data_size=" << buffer.size;
-    }
-
-    /// add a new block
-    if (blockIdMap_.count(key) == 0) {
-      blockOffsetMap_[key] = totalSize;
-      blockIdMap_[key] = numBlocks;
-      ++numBlocks;
-      totalSize += blockSize;
-    }
-    offsets.push_back(blockOffsetMap_[key]);
-    blockIds.push_back(blockIdMap_[key]);
-  }
-
-  size_ = totalSize;
-  LOG(INFO) << "pserver: new cpuvector: size=" << size_;
-  if (!vectors_[PARAMETER_VALUE]) {
-    /// vectors_
-    const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/);
-    for (const auto type : types) {
-      vectors_[type].reset(new CpuVector(size_));
-      vectors_[type]->zeroMem();
-    }
-
-    blockInfos_.resize(numBlocks);
-    for (auto& info : blockInfos_) {
-      info.lock.reset(new std::mutex());
-    }
-  } else {
-    CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize())
-        << "Currently adding new blocks is not supported. "
-        << "All blocks must be added in one setParameter call";
-  }
-
-  VectorPtr buf = vectors_[PARAMETER_VALUE];
-  usedSegments_.reserve(offsets.size());
-  /// if offsets is empty, means parameter_block_size is too big or too many
-  /// nodes.
-  if (offsets.empty()) {
-    LOG(WARNING) << "in setParameter: offsets is empty";
-  }
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    size_t blockId = blockIds[i];
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(request.blocks(i));
-    info.config = &config;
-    info.offset = offsets[i];
-    info.optimizer.reset(sgdOptimizerCreate(
-        config_, config, config.sparse_remote_update(), true /*inPserver*/));
-    if (config.sparse_remote_update()) {
-      size_t width = config.dims(1);
-      CHECK_EQ(config.parameter_block_size(), width)
-          << "block size: " << config.parameter_block_size()
-          << "width : " << width;
-    }
-    info.optimizer->init(1, info.config);
-    usedSegments_.push_back(std::make_pair(
-        offsets[i], offsets[i] + request.blocks(i).block_size()));
-  }
-  mergeSegments(&usedSegments_);
-
-  if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) {
-    /// copy param from trainer
-    for (size_t i = 0; i < offsets.size(); ++i) {
-      Buffer buffer = inputBuffers[i];
-      real* start = buf->getPoint(offsets[i]);
-      CHECK_LE(offsets[i] + buffer.size, buf->getSize());
-      memcpy(start, buffer.base, sizeof(real) * buffer.size);
-    }
-  } else {
-    CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    /// nothing to do, value vector zero mem already
-  }
-}
-
-void ParameterServer2::addGradient(const SendParameterRequest& request,
-                                   std::vector<Buffer>& inputBuffers,
-                                   SendParameterResponse* response,
-                                   std::vector<Buffer>* outputBuffers) {
-  VLOG(1) << "pserver: addGradient";
-
-  {
-    ReadLockGuard guard(parameterMutex_);
-    int bufferIndex = 0;
-    for (const auto& block : request.blocks()) {
-      int64_t offset = getBlockOffset(block);
-      CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                          << " id=" << block.para_id()
-                          << " block id=" << block.block_id();
-
-      int64_t blockId = getBlockId(block);
-      CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                           << " id=" << block.para_id()
-                           << " block id=" << block.block_id();
-
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-
-      const real* gradientBuffer = buffer.base;
-      real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset);
-
-      size_t size = buffer.size;
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      if (config.sparse_remote_update()) {
-        CHECK_EQ(size, config.parameter_block_size());
-      } else {  // dense
-        CHECK_LE(size, config.parameter_block_size());
-      }
-      std::lock_guard<std::mutex> guard(*info.lock);
-      simd::addTo(gradientSumBuffer, gradientBuffer, size);
-    }
-  }
-  if (request.batch_status() == BATCH_FINISH ||
-      request.batch_status() == BATCH_START_AND_FINISH) {
-    numSamplesProcessed_ += request.num_samples();
-    cost_ += request.cost();
-    VLOG(1) << "num samples: " << numSamplesProcessed_
-            << ", new cost:" << cost_;
-
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-    VLOG(1) << "start send back";
-  }
-}
-
-bool ParameterServer2::asyncGrdientCommitCheckAndStat(
-    const SendParameterRequest& request) {
-  const auto trainerId = request.trainer_id();
-  int64_t trainerSteps = asyncTrainerSteps_[trainerId];
-  CHECK_GE(asyncUpdateSteps_, trainerSteps)
-      << " async update steps overflows "
-      << " trainer id: " << trainerId
-      << " async update steps in pserver: " << asyncUpdateSteps_
-      << " async update steps in request: " << trainerSteps;
-
-  asyncUpdateSteps_++;
-  bool commitGradient = true;
-
-  int64_t delta = asyncUpdateSteps_ - trainerSteps;
-  if (delta >= asyncLaggedThreshold_) {
-    VLOG(1) << "discard Async Update: "
-            << " trainer id: " << trainerId
-            << " pserver steps: " << asyncUpdateSteps_
-            << " request steps: " << trainerSteps;
-    asyncLaggedGradientsNum_++;
-    commitGradient = false;
-  }
-  /// stat on lagged steps, to get total discard distribution
-  if (static_cast<size_t>(delta) < asyncUpdateStat_.size()) {
-    asyncUpdateStat_[delta]++;
-  } else {
-    asyncUpdateStat_[asyncUpdateStat_.size() - 1]++;
-  }
-  /// stat on trainerId and discard, to get trainer condition
-  if (commitGradient) {
-    asyncTrainerCommitStat_[trainerId]++;
-  } else {
-    asyncTrainerDiscardStat_[trainerId]++;
-  }
-
-  return commitGradient;
-}
-
-static ThreadLocal<std::vector<bool>> localBlockBitset_;
-
-void ParameterServer2::asyncSGD(const SendParameterRequest& request,
-                                std::vector<Buffer>& inputBuffers,
-                                SendParameterResponse* response,
-                                std::vector<Buffer>* outputBuffers) {
-  int64_t numBlocks = blockIdMap_.size();
-  auto& localBlockBitset = *localBlockBitset_;
-
-  if (isSparseServer_) {
-    if (localBlockBitset.empty()) {
-      localBlockBitset.resize(numBlocks);
-    }
-    localBlockBitset.assign(numBlocks, false);
-  }
-
-  ReadLockGuard guard(parameterMutex_);
-
-  if (request.send_back_parameter()) {
-    outputBuffers->reserve(request.blocks_size());
-  }
-
-  bool commitGradient = asyncGrdientCommitCheckAndStat(request);
-
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  size_t bufferIndex = 0;
-  for (const auto& block : request.blocks()) {
-    int64_t offset = getBlockOffset(block);
-    CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                        << " id=" << block.para_id()
-                        << " block id=" << block.block_id();
-    int64_t blockId = getBlockId(block);
-    CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                         << " id=" << block.para_id()
-                         << " block id=" << block.block_id();
-    Buffer buffer = inputBuffers[bufferIndex];
-    ++bufferIndex;
-
-    size_t size = buffer.size;
-
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-
-    std::lock_guard<std::mutex> guard(*info.lock);
-    /// gradients are too obsolete, will be discarded
-    if (commitGradient) {
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size);
-      info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1);
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-
-    if (commitGradient && isSparseServer_) {
-      localBlockBitset[blockId] = true;
-    }
-
-    if (!isSparseServer_ && request.send_back_parameter()) {  // dense
-      int type = request.send_back_parameter_type();
-      sendBackParameter(block, type, response, &buffer, outputBuffers);
-    }
-  }  /// foreach block
-
-  asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_;
-
-  if (commitGradient && isSparseServer_) {
-    /// find blocks that trainer do not request update
-    for (int64_t blockId = 0; blockId < numBlocks; ++blockId) {
-      if (localBlockBitset[blockId]) {
-        continue;
-      }
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = *info.config;
-      size_t size = config.parameter_block_size();
-
-      std::lock_guard<std::mutex> guard(*info.lock);
-      info.optimizer->startBatch(numSamplesProcessed_);
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, info.offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-  }
-
-  if (commitGradient && (request.batch_status() == BATCH_FINISH ||
-                         request.batch_status() == BATCH_START_AND_FINISH)) {
-    numSamplesProcessed_ += request.num_samples();
-  }
-
-  /// show some performance log if needed
-  if (request.trainer_id() == 0) {
-    /// batchId_ is approximately equal to "real batchId_"
-    batchId_++;
-  }
-}
-
-void ParameterServer2::getParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  LOG(INFO) << "pserver: getParameter";
-  ReadLockGuard guard(parameterMutex_);
-  for (const auto& block : request.blocks()) {
-    int type = request.send_back_parameter_type();
-    sendBackParameter(block, type, response, outputBuffers);
-  }
-}
-
-void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
-                                          std::vector<Buffer>& inputBuffers,
-                                          SendParameterResponse* response,
-                                          std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  auto& buffer = *readWriteBuffer_;
-  size_t numReals = 0;
-  for (const auto& block : request.blocks()) {
-    numReals += getParameterConfig(block).dims(1);
-  }
-  buffer.resize(numReals);
-
-  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
-
-  ReadLockGuard guard(parameterMutex_);
-  size_t offset = 0;
-  for (const auto& block : request.blocks()) {
-    size_t width = getParameterConfig(block).dims(1);
-    Buffer buf = {buffer.data() + offset, width};
-    int type = request.send_back_parameter_type();
-    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
-    offset += width;
-  }
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  outputBuffers->push_back({valueBuffer, (size_t)block.block_size()});
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         Buffer* buffer,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  size_t size = buffer->size;
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  /// copy to second buffer to avoid to be polluted by other request
-  memcpy(buffer->base, valueBuffer, sizeof(real) * size);
-  outputBuffers->push_back({buffer->base, size});
-}
-
-void ParameterServer2::sendBackParameterSparse(
-    const ParameterBlock& block,
-    int parameterType,
-    SendParameterResponse* response,
-    Buffer* buffer,
-    size_t width,
-    std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  CHECK_EQ(buffer->size, width);
-  memcpy(buffer->base, valueBuffer, width * sizeof(real));
-  outputBuffers->push_back(*buffer);
-}
-
-void ParameterServer2::readAllBlocks(
-    MsgReader* msgReader, std::vector<ParameterServer2::Buffer>* buffers) {
-  auto& buffer = *readWriteBuffer_;
-  size_t numBlocks = msgReader->getNumBlocks();
-  buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real),
-                              numBlocks);
-  std::vector<void*> bufs(numBlocks);
-  buffers->clear();
-  buffers->reserve(numBlocks);
-  buffer.resetAlignAlloc();
-  for (size_t i = 0; i < numBlocks; ++i) {
-    size_t len = msgReader->getBlockLength(i);
-    CHECK_EQ(len % sizeof(real), (size_t)0);
-    size_t size = len / sizeof(real);
-    bufs[i] = buffer.nextBlock(size);
-    buffers->push_back({(real*)bufs[i], size});
-  }
-  msgReader->readBlocks(bufs);
-}
-
-void ParameterServer2::sendParameter(const SendParameterRequest& request,
-                                     std::unique_ptr<MsgReader> msgReader,
-                                     ProtoResponseCallbackEx callback) {
-  SendParameterResponse response;
-  std::vector<Buffer> inputBuffers;
-  std::vector<Buffer> outputBuffers;
-  readAllBlocks(msgReader.get(), &inputBuffers);
-  msgReader.reset();
-
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-      setParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-      getParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-      getParameterSparse(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-      asyncSGD(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      addGradient(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      break;
-  }
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      (*requestVec_).push_back(request);
-      (*callbackVec_).push_back(callback);
-      if (request.batch_status() == BATCH_FINISH ||
-          request.batch_status() == BATCH_START_AND_FINISH) {
-        for (size_t i = 0; i < (*requestVec_).size(); i++) {
-          ReadLockGuard guard(parameterMutex_);
-          SendParameterRequest& request = (*requestVec_)[i];
-          SendParameterResponse responseTemp;
-
-          std::vector<iovec> outputIovs;
-          if (request.send_back_parameter()) {
-            CHECK(!isSparseServer_);
-            std::vector<Buffer> outputBuffersTemp;
-            for (const auto& block : request.blocks()) {
-              int type = request.send_back_parameter_type();
-              sendBackParameter(block, type, &responseTemp, &outputBuffersTemp);
-            }
-            outputIovs.reserve(outputBuffersTemp.size());
-            for (auto buffer : outputBuffersTemp) {
-              outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-            }
-          }
-
-          ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i];
-          callbackTemp(responseTemp, outputIovs);
-        }
-        (*requestVec_).clear();
-        (*callbackVec_).clear();
-      }
-      break;
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      std::vector<iovec> outputIovs;
-      outputIovs.reserve(outputBuffers.size());
-      for (auto buffer : outputBuffers) {
-        outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-      }
-      callback(response, outputIovs);
-      break;
-  }
-}
-
-template <typename Dtype>
-void ParameterServer2::reduceAndSendData(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  auto sendData = reinterpret_cast<Dtype*>(dataMems_[0].get()->getBuf());
-  size_t rawMemSize = dataMems_[0].get()->getSize();
-  CHECK_EQ(rawMemSize % sizeof(Dtype), 0U);
-  size_t dataMemSize = rawMemSize / sizeof(Dtype);
-  for (size_t i = 1; i < dataMems_.size(); ++i) {
-    CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize);
-    auto data = reinterpret_cast<Dtype*>(dataMems_[i].get()->getBuf());
-    for (size_t j = 0; j < dataMemSize; ++j) {
-      sendData[j] += data[j];
-    }
-  }
-  std::vector<iovec> outputIovs;
-  auto block = response.add_blocks();
-  outputIovs.push_back({sendData, rawMemSize});
-  block->set_total_size(rawMemSize);
-  block->set_data_size(sizeof(Dtype));
-  callback(response, outputIovs);
-}
-
-void ParameterServer2::templateReduceSum(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  const auto& block = request.blocks(0);
-  switch (block.data_type()) {
-    case TRANS_FLOAT:
-      reduceAndSendData<float>(request, msgReader, callback);
-      break;
-    case TRANS_DOUBLE:
-      reduceAndSendData<double>(request, msgReader, callback);
-      break;
-    case TRANS_INT32:
-      reduceAndSendData<int>(request, msgReader, callback);
-      break;
-    case TRANS_UINT32_T:
-      reduceAndSendData<uint32_t>(request, msgReader, callback);
-      break;
-    case TRANS_INT64_T:
-      reduceAndSendData<int64_t>(request, msgReader, callback);
-      break;
-    case TRANS_UINT64_T:
-      reduceAndSendData<uint64_t>(request, msgReader, callback);
-      break;
-    default:
-      LOG(FATAL) << "not supported";
-      break;
-  }
-}
-
-void ParameterServer2::sendData(const SendDataRequest& request,
-                                std::unique_ptr<MsgReader> msgReader,
-                                ProtoResponseCallbackEx callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  switch (request.update_mode()) {
-    case DATA_UPDATE_MODE_SET_OWN: {
-      CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size()));
-      size_t totalLen = msgReader->getTotalLength();
-      if (totalLen > 0) {
-        CHECK_EQ(msgReader->getNumBlocks(), 1U)
-            << "Only one block currently support now!";
-        const auto& block = request.blocks(0);
-        if (0 == dataSize_) {
-          dataSize_ = block.data_size();
-        } else {
-          CHECK_EQ(dataSize_, block.data_size());
-        }
-        int64_t serverId = request.server_id();
-        if (serverId_ < 0) {
-          serverId_ = serverId;
-        } else {
-          CHECK_EQ(serverId_, serverId);
-        }
-        int64_t clientId = request.client_id();
-        dataMems_[clientId] = std::make_shared<CpuMemoryHandle>(totalLen);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        msgReader->readNextBlock(dataMems_[clientId].get()->getBuf());
-      }
-      msgReader.reset();
-      std::vector<iovec> outputIovs;
-      callback(response, outputIovs);
-      break;
-    }
-    case DATA_UPDATE_MODE_GET_ALL: {
-      /// Currently only support DATA_REDUCE_SUM
-      /// And their Operations are just add
-      CHECK(DATA_REDUCE_SUM == request.type());
-      templateReduceSum(request, msgReader, callback);
-      break;
-    }
-    default: { LOG(FATAL) << "not supported"; }
-  }
-}
-
-void ParameterServer2::clearUnusedSegments(CpuVector* vec) {
-  real* data = vec->getData();
-  if (usedSegments_.empty()) {
-    return;
-  }
-  memset(data, 0, sizeof(real) * usedSegments_[0].first);
-  memset(data + usedSegments_.back().second,
-         0,
-         sizeof(real) * (size_ - usedSegments_.back().second));
-  size_t n = size_ - usedSegments_.back().second;
-
-  for (size_t i = 1; i < usedSegments_.size(); ++i) {
-    memset(
-        data + usedSegments_[i - 1].second,
-        0,
-        sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second));
-    n += usedSegments_[i].first - usedSegments_[i - 1].second;
-  }
-}
-
-void ParameterServer2::parallelExecForEachBlock(ExecFunc func) {
-  SyncThreadPool::execHelper(
-      syncThreadPool_.get(), [&](int tid, size_t numThreads) {
-        int64_t numBlocks = blockIdMap_.size();
-        VectorPtr* vecs = parameter::getThreadLocalBuffer();
-        for (int64_t blockId = tid; blockId < numBlocks;
-             blockId += numThreads) {
-          func(blockId, vecs);
-        }
-      });
-}
-
-void ParameterServer2::blockTraverse(
-    BlockInfo& info,
-    const ParameterConfig& config,
-    int64_t offset,
-    size_t size,
-    const VectorPtr vecs[],
-    const ParameterOptimizer::TraverseCallback& callback) {
-  /// setup sub bufs
-  for (const auto type : info.optimizer->getParameterTypes()) {
-    vecs[type]->subVecFrom(*vectors_[type], offset, size);
-  }
-  callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-}
-
-void ParameterServer2::op_SGD(const Operation& operation,
-                              OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  if (allClientPassFinish_) {
-    /// when all clients signal pass finished, the update
-    /// is empty.
-    return;
-  }
-
-  {
-    parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      int64_t offset = info.offset;
-      size_t size = config.parameter_block_size();
-
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      info.optimizer->update(
-          vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    });
-  }
-
-  batchId_++;
-}
-
-void ParameterServer2::op_start_pass(const Operation& operation,
-                                     OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    info.optimizer->startPass();
-  });
-}
-
-void ParameterServer2::op_finish_pass(const Operation& operation,
-                                      OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    /// catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, info.offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    /// finish pass
-    info.optimizer->finishPass();
-  });
-  batchId_ = 0;
-}
-
-void ParameterServer2::op_apply(const Operation& operation,
-                                OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    int64_t offset = info.offset;
-    size_t size = config.parameter_block_size();
-
-    // catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    // apply to PARAMETER_APPLY
-    if (auto callback = info.optimizer->apply()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-    }
-  });
-}
-
-void ParameterServer2::op_randomize(const Operation& operation,
-                                    OperationResult* result) {
-  LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_;
-
-  CpuVector& valueVec = *vectors_[PARAMETER_VALUE];
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size);
-    Parameter::randomize(vecs[PARAMETER_VALUE], config);
-  });
-}
-
-void ParameterServer2::loadValueVector(const LoadValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  LoadValueResponse response;
-  LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_;
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ifstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameters in pserver";
-  CHECK(Parameter::isHeaderFormatSupported(header.format))
-      << "Incorrect format version: " << header.format;
-  CHECK_EQ(header.size, (size_t)size_)
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << size_ << ") of the pserver: " << serverId_;
-  CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize "
-                                           << header.valueSize;
-  CHECK(fs.read(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)));
-
-  callback(response);
-}
-
-void ParameterServer2::saveValueVector(const SaveValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  SaveValueResponse response;
-  LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_;
-
-  mkDir(request.dir_name().c_str());
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
-                                             : *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  // TODO(TJ): save param headerFormat_
-  header.format = PARAM_FORMAT_ORIGINAL;
-  header.valueSize = sizeof(real);
-  header.size = size_;
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(fs.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  CHECK(fs.write(reinterpret_cast<char*>(vec.getData()),
-                 header.size * sizeof(real)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  callback(response);
-}
-
-void ParameterServer2::op_RESET(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  CpuVector* u = vectors_[operation.pvectors(0)].get();
-  u->reset(operation.scalars(0));
-  clearUnusedSegments(u);
-}
-
-void ParameterServer2::op_utv(const Operation& operation,
-                              OperationResult* result) {
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum += (double)u[i] * (double)v[i];
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_au_bv(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = a * u[i] + b * v[i];
-  }
-}
-
-void ParameterServer2::op_COPY(const Operation& operation,
-                               OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = u[i];
-  }
-}
-
-void ParameterServer2::op_au(const Operation& operation,
-                             OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    u[i] *= a;
-  }
-}
-
-void ParameterServer2::op_au_bv_cw(const Operation& operation,
-                                   OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  real* w = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  real c = operation.scalars(2);
-  for (int64_t i = 0; i < size; ++i) {
-    w[i] = a * u[i] + b * v[i] + c * w[i];
-  }
-}
-
-void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation,
-                                                 OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] < 0) {
-      dir[i] = -grad[i] + l1weight;
-    } else if (x[i] > 0) {
-      dir[i] = -grad[i] - l1weight;
-    } else {
-      if (grad[i] < -l1weight) {
-        dir[i] = -grad[i] - l1weight;
-      } else if (grad[i] > l1weight) {
-        dir[i] = -grad[i] + l1weight;
-      } else {
-        dir[i] = 0;
-      }
-    }
-  }
-}
-
-void ParameterServer2::op_fix_dir_signs(const Operation& operation,
-                                        OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* steepestDescDir = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] * steepestDescDir[i] <= 0) {
-      dir[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_fix_omega_signs(const Operation& operation,
-                                          OperationResult* result) {
-  (void)result;
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newx = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] * newx[i] < 0) {
-      newx[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_dir_deriv(const Operation& operation,
-                                    OperationResult* result) {
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] != 0) {
-      if (x[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (x[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      } else if (dir[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (dir[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      }
-    }
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_cost(const Operation& operation,
-                               OperationResult* result) {
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newgrad = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  real l2weight = operation.scalars(1);
-  double cost_real = cost_ / mpiSize_;
-  double sum_weight_l1 = 0;
-  double sum_weight_l2 = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum_weight_l1 += std::abs(x[i]);
-    sum_weight_l2 += x[i] * x[i];
-    newgrad[i] += 2.0 * l2weight * x[i];
-  }
-  cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2;
-  result->add_scalars(cost_real);
-}
-
-ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = {
-    nullptr,                         // PSERVER_OP_utu = 0;
-    &ParameterServer2::op_utv,       // PSERVER_OP_utv = 1;
-    &ParameterServer2::op_au,        // PSERVER_OP_au = 2;
-    &ParameterServer2::op_au_bv,     // PSERVER_OP_au_bv = 3;
-    nullptr,                         // PSERVER_OP_aAx_bu = 4;
-    &ParameterServer2::op_SGD,       // PSERVER_OP_SGD = 5;
-    &ParameterServer2::op_RESET,     // PSERVER_OP_RESET = 6;
-    &ParameterServer2::op_COPY,      // PSERVER_OP_COPY = 7;
-    &ParameterServer2::op_au_bv_cw,  // PSERVER_OP_au_bv_cw = 8;
-    &ParameterServer2::op_make_steepest_desc_dir,
-    /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
-    &ParameterServer2::op_fix_dir_signs,    // PSERVER_OP_FIX_SIGNS = 10;
-    &ParameterServer2::op_dir_deriv,        // PSERVER_OP_DIR_DERIV = 11;
-    &ParameterServer2::op_fix_omega_signs,  // PSERVER_OP_FIX_OMEGA_SIGNS = 12;
-    &ParameterServer2::op_cost,             // PSERVER_OP_COST = 13
-    &ParameterServer2::op_start_pass,       // PSERVER_OP_START_PASS = 14
-    &ParameterServer2::op_finish_pass,      // PSERVER_OP_FINISH_PASS = 15
-    &ParameterServer2::op_randomize,        // PSERVER_OP_RANDOMIZE = 16
-    &ParameterServer2::op_apply,            // PSERVER_OP_APPLY = 17
-};
-
-void ParameterServer2::doOperation(const DoOperationRequest& request,
-                                   ProtoResponseCallback callback) {
-  if (request.wait_for_gradient()) {
-    /// wait gradient update
-    gradientReadyBarrier_.wait();
-    allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers;
-  }
-
-  DoOperationResponse response;
-  response.set_pass_finish(allClientPassFinish_);
-
-  for (const auto& op : request.operations()) {
-    OperationResult* opResult = response.add_results();
-    if (op.operation() >= ARRAYSIZE(opFuncs)) {
-      LOG(ERROR) << "Unknown operation " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    OperatorFunction opFunc = opFuncs[op.operation()];
-    if (!opFunc) {
-      LOG(ERROR) << "Operation not implemented: " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    (this->*opFunc)(op, opResult);
-  }
-
-  if (request.send_back_parameter()) {
-    /// clean current cost
-    cost_ = 0;
-
-    if (allClientPassFinish_ && request.release_pass()) {
-      /// This signals that all clients finish one pass, so waitPassFinish()
-      /// will stop waiting.
-      numPassFinishClients_ = 0;
-    }
-
-    /// notify addGradient() to send back parameter
-    parameterReadyBarrier_.wait();
-  }
-  callback(response);
-}
-
-void ParameterServer2::waitPassStart(const WaitPassStartRequest& request,
-                                     ProtoResponseCallback callback) {
-  passBarrier_.wait();
-  callback(WaitPassStartResponse());
-}
-
-void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request,
-                                      ProtoResponseCallback callback) {
-  numPassFinishClients_ += 1;
-
-  while (numPassFinishClients_ != 0) {
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-  }
-
-  callback(WaitPassFinishResponse());
-}
-
-void ParameterServer2::synchronize(const SynchronizeRequest& request,
-                                   ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  dataSize_ = 0;
-  callback(SynchronizeResponse());
-}
-
-void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
-                                       ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  callback(SynchronizeResponse());
-
-  if (request.trainer_id() == 0) {
-    batchId_ = 0;
-  }
-}
-
-void ParameterServer2::createVector(const CreateVectorRequest& request,
-                                    ProtoResponseCallback callback) {
-  (void)request;
-  CreateVectorResponse response;
-  LOG(INFO) << "ParameterServer2::createVector: size=" << size_;
-  CpuVectorPtr vec = std::make_shared<CpuVector>(size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = vectors_.size();
-    vectors_.push_back(vec);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseVector(const ReleaseVectorRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseVectorResponse response;
-  CpuVectorPtr vec;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    vec.swap(vectors_[request.handle()]);
-  }
-  callback(response);
-}
-
-void ParameterServer2::createMatrix(const CreateMatrixRequest& request,
-                                    ProtoResponseCallback callback) {
-  CreateMatrixResponse response;
-  /// We need to create column major matrix of size_ * num_cols
-  /// Matrix is row majoar. Need to tranpose when use it.
-  CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = matrices_.size();
-    matrices_.push_back(mat);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseMatrixResponse response;
-  CpuMatrixPtr mat;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    mat.swap(matrices_[request.handle()]);
-  }
-  callback(response);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.h b/paddle/legacy/pserver/ParameterServer2.h
deleted file mode 100644
index 069e730ea4e..00000000000
--- a/paddle/legacy/pserver/ParameterServer2.h
+++ /dev/null
@@ -1,696 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <limits>
-#include <mutex>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <vector>
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-
-DECLARE_int32(port);
-
-namespace paddle {
-
-// @TODO(yanfei):
-// if armed with high density computation resource per node, pserver could also
-// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline
-// network receiving and GPU computation to reduce the network overhead even
-// further. the pipeline could help to accelerate BIG model training.
-// @TODO:(yanfei)
-// for cpu and less/low gpu machine, the time exhausted by forward and backward
-// could be larger than optimization at pserver. However, if armed with lots of
-// gpus per node and if the model size is so large enough that limited cpu
-// computation causes big optmization latency, the GPU may be required by
-// pserver.
-
-/**
- * Client interface for the parameter server
- *
- * it implements several rpc API for remote parameter client usage.
- * for sync-sgd, client needs one controller thread to build connections
- * to all pservers, these controller connections do barriers
- * synchronization with these connections used for transfering data.
- * each data connection uses block based fine grained synchronization
- * to gain better scalability. Merging gradients from different trainers
- * are concurrently executed with block units, so that some network
- * overhead will be hidden in merging gradient.
- * for async-sgd, the difference is that pserver will do optimization
- * immediately if the gradients are ready, so that pserver needs to
- * prepare separate buffer to store value for sending back to trainer
- * to prevent from being polluted.
- */
-class ParameterServer2 : public ProtoServer {
- protected:
-  /// parameter_ mutex.
-  RWLock parameterMutex_;
-
-  typedef std::pair<size_t, int64_t> BlockKey;
-  struct BlockKeyHash {
-    size_t operator()(const BlockKey& key) const {
-      return std::hash<size_t>()(key.first) + key.second;
-    }
-  };
-
-  // TODO(yanfei):
-  // if index data structure is based on parameters instead of blocks, the
-  // lookup performance could be better. In addition, the block memory
-  // access almost exhibits good locality, so index data structure and
-  // block data structure can be refined further, especially if gpu is used
-  // for pserver.
-  /**
-   * all parameters are stored in CpuVector with a blockMap_ data structure
-   * to index block data required by requests.
-   */
-  typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap;
-  /// <(para, block), global offset(byte) in all parameters>
-  BlockMap blockOffsetMap_;
-  /// <(para, block), global idx [0, nBlocksInAllParameters]>
-  BlockMap blockIdMap_;
-
-  std::vector<CpuVectorPtr> vectors_;
-  std::vector<CpuMatrixPtr> matrices_;
-  std::vector<CpuMemHandlePtr> dataMems_;
-
-  // TODO(yanfei):
-  // if storing sparse_remote_update() flag in request instead of
-  // reading configMap_, and storing config within new block wise
-  // overview data structure, the config mapping, block mapping
-  // can be unified in single clean data structure. Use para_id
-  // to index parameters, use offset to index block within parameter
-  // and keep two index into single one.
-  /**
-   * mapping between parameter and config
-   * different parameter allows different config, such as decay_rate.
-   * for each request, it need to read config for adding gradient
-   * and optmization.
-   */
-  std::unordered_map<size_t, ParameterConfig> configMap_;
-
-  /**
-   * to parallelize the multi-thread and multi-connnection
-   * computation at pserver, it use block unit to reduce
-   * the contention for computation, even further use block
-   * level optimizater control for each block for some special
-   * reason annotated below.
-   */
-  struct BlockInfo {
-    const ParameterConfig* config;
-    std::unique_ptr<std::mutex> lock;
-    /// global offset for all parameters
-    uint64_t offset;
-    /**
-     *
-     * Async sgd in pserver is very different from sync sgd.
-     * Each trainer follows startBatch, update*, finishBatch as in
-     * sync sgd, but all these actions are almost executed by
-     * multi-core and multi-thread simutaneously, so that async
-     * sgd optimization is based on block level in reality, then
-     * per block optimization is necessary indeed. In addition,
-     * per block optimization is also perfered for performance
-     * with multithreads.
-     */
-    std::unique_ptr<ParameterOptimizer> optimizer;
-  };
-  std::vector<BlockInfo> blockInfos_;
-
-  typedef std::vector<std::pair<int64_t, int64_t>> BlockSegments;
-  /// Because some blocks might not be fully used. We keep a
-  /// record of which segments are used.
-  BlockSegments usedSegments_;
-
-  /// record pserver status, all status defined in ParameterService.pb
-  PServerStatus status_;
-  /// record all samples processed which could be used by optimizater
-  std::atomic<int64_t> numSamplesProcessed_;
-  double cost_;
-  int mpiSize_;
-  int dataSize_;
-  /// configuration for current parameter optimizer
-  OptimizationConfig config_;
-
-  /**
-   * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse
-   * compute. And add some helper method to allocate memory aligned blocks.
-   *
-   * @param T          type of element.
-   * @param AlignBytes the memory aligned bytes for allocated blocks.
-   */
-  template <typename T, size_t AlignBytes>
-  class ReadWriteBuffer
-      : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
-   public:
-    static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
-                  "Type T must be able to aligned.");
-
-    /**
-     * @brief IsTLargerThanAlign compiled time calculated constant for is type
-     * T larger than alignments.
-     */
-    constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes;
-
-    static_assert(std::is_pod<T>::value, "T must be POD type.");
-
-    /**
-     * @brief if AlignBytes > sizeof(T), then will calcuate how many elements
-     * can be stored in AlignBytes.
-     */
-    constexpr static size_t AlignElementCount = AlignBytes / sizeof(T);
-
-    static_assert(AlignElementCount ==
-                          (AlignElementCount & -AlignElementCount) ||
-                      AlignBytes > sizeof(T),
-                  "AlignElementCount should be exp of 2");
-
-    /**
-     * @brief Resize Buffer, with block count that will be allocated. Each block
-     * will be memory aligned in AlignBytes.
-     * @param size The element count in all blocks.
-     * @param alignBlockCount The block count that will be allocated.
-     */
-    void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) {
-      if (IsTLargerThanAlign) {  //! So, each elements is memory aligned.
-        this->resize(size);
-      } else {
-        //! at most, we need such elements in buffer to make sure each block is
-        //! aligned.
-        this->resize(size + alignBlockCount * (AlignElementCount - 1));
-      }
-    }
-
-    /**
-     * @brief reset aligned allocate blocks.
-     */
-    void resetAlignAlloc() { this->curOffset_ = 0; }
-
-    /**
-     * @brief get next aligned block address.
-     * @param blockSize is the element count in each block.
-     * @return Aligned block address.
-     */
-    T* nextBlock(size_t blockSize) {
-      T* r = &this->operator[](curOffset_);
-      curOffset_ += blockSize;
-
-      if (!IsTLargerThanAlign) {
-        curOffset_ =
-            (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1);
-      }
-      return r;
-    }
-
-   private:
-    size_t curOffset_;
-  };
-
-  /// to buffer the data from network for further processing to
-  /// reduce redundant memory allocation.
-  ThreadLocal<ReadWriteBuffer<real, ALIGN_HINT>> readWriteBuffer_;
-
-  /// size of the parameter
-  int64_t size_;
-
-  /// for synchronized training, check details in addGradient()
-  /// and doOperation()
-  ThreadBarrier gradientReadyBarrier_;
-  ThreadBarrier parameterReadyBarrier_;
-  ThreadBarrier passBarrier_;
-  ThreadLocal<std::vector<SendParameterRequest>> requestVec_;
-  ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_;
-
-  std::atomic<int> numPassFinishClients_;
-  bool allClientPassFinish_;
-
-  std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_;
-  std::atomic<int> serverId_;
-
-  /**
-   *
-   * for lagged async gradient gradient commit control in Async Sgd.
-   * discard lagged gradients from too slow nodes, whose gradients
-   * exhibits bad quality.
-   * Algorithm:
-   * pserver:
-   * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0.
-   * syncUpdaterSteps means
-   *    the version of parameter value.
-   * 2. when pull arrives, record asyncUpdateSteps_ into
-   * syncTrainerSteps_[trainer_id]
-   * 3. when push arrives, compare asyncUpdateSteps_ with
-   * syncTrainerSteps_[trainer_id]
-   *    if delta > threshold, discard current gradient, else commit
-   *    gradient.
-   * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass
-   * finished
-   * Note:
-   * it can not discard all lag-gradient strictly in some special
-   * condition. part of gradients could be discarded if
-   * ConcurrentRemoteParameterUpdater is sed.
-   * this algorithm is implemented in asynSGD()
-   */
-  int64_t asyncLaggedThreshold_;
-  std::atomic<int64_t> asyncUpdateSteps_;
-  std::vector<int64_t> asyncTrainerSteps_;
-  size_t asyncLaggedGradientsNum_;
-  /// stat all async update
-  std::vector<size_t> asyncUpdateStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerDiscardStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerCommitStat_;
-
-  /// only used by controller and other control cmd from trainer number 0
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  /// pserver for sparse remote update parameters
-  bool isSparseServer_;
-
-  /// barrier performance tuning sync-sgd required
-  std::atomic<int64_t> batchId_;
-
- public:
-  struct Buffer {
-    real* base;
-    size_t size;
-  };
-
- protected:
-  /// async gradient commit control
-  bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
-
- public:
-  /// disable default parameter for overloading
-  /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
-  /// -1 means using TCP transport instead of RDMA
-  ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1);
-
-  ~ParameterServer2() {}
-
-  static const std::string kRetMsgInvalidMatrixHandle;
-  static const std::string kRetMsgInvalidVectorHandle;
-  static const std::string kRetMsgUnknownOperation;
-
-  /// service functions
-  template <typename Dtype>
-  void reduceAndSendData(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  void templateReduceSum(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  /**
-   * @brief framework for sending parameters
-   *
-   * @note  different parameter data type can be sent to pserver.
-   *        in most case, the api is used to send gradients from
-   *        trainer to pserver.
-   *        it also can be used to retrieve parameters from pserver
-   */
-  void sendParameter(const SendParameterRequest& request,
-                     std::unique_ptr<MsgReader> msgReader,
-                     ProtoResponseCallbackEx callback);
-
-  void sendData(const SendDataRequest& request,
-                std::unique_ptr<MsgReader> msgReader,
-                ProtoResponseCallbackEx callback);
-
-  /**
-   * @brief send config to pserver
-   *
-   * @note  it can help pserver to understand the configuration for
-   * optimization,
-   *        logging control, duplicated initialization, etc.
-   */
-  void setConfig(const SetConfigRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief get status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver
-   */
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief set status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver, since parameters
-   *        at pserver are initialized by trainer
-   */
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief framework for doing some operation at pserver end
-   *
-   * @note  if sync-sgd is used, controller will calling op_SGD action
-   *        for gradient optimization.
-   *        check avaiable operations in opFuncs[]
-   */
-  void doOperation(const DoOperationRequest& request,
-                   ProtoResponseCallback callback);
-
-  /// Create a column vector. The size is the dimension of parameter
-  void createVector(const CreateVectorRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseVector(const ReleaseVectorRequest& request,
-                     ProtoResponseCallback callback);
-
-  /// Create a column major matrix. The number of rows is the dimension of
-  /// parameter. The number of columns is specifed by num_cols.
-  void createMatrix(const CreateMatrixRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseMatrix(const ReleaseMatrixRequest& request,
-                     ProtoResponseCallback callback);
-  /**
-   * @brief stateful control for indicationg sync pass start
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassStart(const WaitPassStartRequest& request,
-                     ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicationg sync pass end
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassFinish(const WaitPassFinishRequest& request,
-                      ProtoResponseCallback callback);
-
-  /**
-   * @brief synchronize all distributed trainers
-   *
-   * @note  it's general api for synchronizing trainer and pserver
-   */
-  void synchronize(const SynchronizeRequest& request,
-                   ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicating async pass is finished
-   *
-   * @note  it is valuable for logging control, state reset, etc.
-   */
-  void asyncFinishPass(const SynchronizeRequest& request,
-                       ProtoResponseCallback callback);
-
-  void loadValueVector(const LoadValueRequest& request,
-                       ProtoResponseCallback callback);
-
-  void saveValueVector(const SaveValueRequest& request,
-                       ProtoResponseCallback callback);
-
- public:
-  /**
-   * @brief initialize parameter server
-   */
-  bool init();
-
-  /**
-   * @brief set parameters at pserver
-   *
-   * @note  do parameter initialization if neccessy.
-   */
-  void setParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief receive gradients and do optimization for async-sgd
-   *
-   * @note  this api asynchronizately receives all data from all
-   *        trainers, and immediately do optimization and return
-   *        optimizated value for trainer.
-   *        this above routine are block based atomic updating,
-   *        which means different block could based different stale
-   *        gradient.
-   *        it will discard some lagged gradients by default for
-   *        better convergence.
-   */
-  void asyncSGD(const SendParameterRequest& request,
-                std::vector<Buffer>& inputBuffers,
-                SendParameterResponse* response,
-                std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief merge gradients from all trainer
-   *
-   * @note  this api use block based parallelization as fine grained
-   *        parallelization which benifits lock contention and latency
-   *        hidden for communication, also can harness multi-core
-   *        efficiently.
-   *        it also implements the synchronization for sync-sgd
-   */
-  void addGradient(const SendParameterRequest& request,
-                   std::vector<Buffer>& inputBuffers,
-                   SendParameterResponse* response,
-                   std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get dense parameters from pserver
-   *
-   * @note  for some specified condition, trainer will get parameters from
-   *        pservers.
-   *        e.g.
-   *        if all parameters are stored at perver end for big model training
-   *        trainer can use it to retrieve all parameters if necessary.
-   */
-  void getParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get sparse value from parameter server
-   *
-   * @note  with sparse enabled, pservers own all latest value
-   *        while trainer only retrieve value that only are needed.
-   *        e.g.
-   *        trainer will do prefetch action to retrieve necessary latest
-   *        value from pserver for sparse calculation.
-   */
-  void getParameterSparse(const SendParameterRequest& request,
-                          std::vector<Buffer>& inputBuffers,
-                          SendParameterResponse* response,
-                          std::vector<Buffer>* outputBuffers);
-
- protected:
-  void mergeSegments(BlockSegments* segments);
-
-  /// set the unused segments to zero
-  void clearUnusedSegments(CpuVector* vec);
-
-  // TODO(yanfei):
-  // if read data and do optimization interleavely block by block,
-  // the performance could be better for gaining less network congestion.
-  /// read all data from connection and store it in static pre-allocated buffer
-  void readAllBlocks(MsgReader* msgReader,
-                     std::vector<ParameterServer2::Buffer>* buffers);
-
-  const ParameterConfig& getParameterConfig(const ParameterBlock& block) {
-    CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:"
-                                    << block.para_id();
-    const auto it = configMap_.find(block.para_id());
-    CHECK(it != configMap_.end()) << "can not find parameter id: "
-                                  << block.para_id();
-    return it->second;
-  }
-
-  /// it implictly check blockOffsetMap_ while retrieving blockId
-  const ParameterConfig& getParameterConfig(int64_t blockId) const {
-    CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size())
-        << "block idx out of range, id: " << blockId
-        << " info size: " << blockInfos_.size();
-    return *(blockInfos_[blockId].config);
-  }
-
-  template <class Response>
-  bool isValidVectorHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= vectors_.size()) {
-      LOG(ERROR) << "Invalid vector handle " << handle;
-      response->set_return_message(kRetMsgInvalidVectorHandle);
-      return false;
-    }
-    return true;
-  }
-
-  template <class Response>
-  bool isValidMatrixHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= matrices_.size()) {
-      LOG(ERROR) << "Invalid matrix handle " << handle;
-      response->set_return_message(kRetMsgInvalidMatrixHandle);
-      return false;
-    }
-    return true;
-  }
-
-  /**
-   * @brief get block offset
-   *
-   * @note  block.begin_dim is added to the block offset.
-   *        return -1 if block cannot be found
-   */
-  int64_t getBlockOffset(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockOffsetMap_.find(key);
-    if (it == blockOffsetMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /// return -1 if block cannot be found
-  int64_t getBlockId(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockIdMap_.find(key);
-    if (it == blockIdMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify reponse and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses
-   *        vectors_[parameterType] directly
-   *        for dense with sync-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify response and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses buffer->base
-   *        The parameter values are copied from vectors_[parameterType]
-   *        to buffer->base.
-   *        for dense with async-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         Buffer* buffer,
-                         std::vector<Buffer>* outputBuffers);
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  specified for sparse
-   */
-  void sendBackParameterSparse(const ParameterBlock& block,
-                               int parameterType,
-                               SendParameterResponse* response,
-                               Buffer* buffer,
-                               size_t width,
-                               std::vector<Buffer>* outputBuffers);
-
-  /**
-   * framework routine for block parallelization
-   * e.g.
-   * for optimization on all blocks at pserver end, this routine can facilitize
-   * the parallelize of do optimization on all blocks with multithreads.
-   */
-  typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc;
-  void parallelExecForEachBlock(ExecFunc func);
-  void blockTraverse(BlockInfo& info,
-                     const ParameterConfig& config,
-                     int64_t offset,
-                     size_t size,
-                     const VectorPtr vecs[],
-                     const ParameterOptimizer::TraverseCallback& callback);
-
- public:
-  typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
-                                                     OperationResult* result);
-
-  /**
-   * doOperation will call following operations indirectly
-   * e.g.
-   * for sync-sgd control, the controller in remote updater will send op_SGD
-   * command to pserver, then send sendParameter request to pserver immediately.
-   * the two function at pserver end will do cooperation to achieve the sync-sgd
-   * gradient merge and optimization.
-   * the most following operations are specified for owlqn, all operations are
-   * under the context of doOperation function
-   */
-  static OperatorFunction opFuncs[];
-
-  void op_SGD(const Operation& operation, OperationResult* result);
-
-  void op_RESET(const Operation& operation, OperationResult* result);
-
-  void op_utv(const Operation& operation, OperationResult* result);
-
-  void op_au_bv(const Operation& operation, OperationResult* result);
-
-  void op_COPY(const Operation& operation, OperationResult* result);
-
-  void op_au(const Operation& operation, OperationResult* result);
-
-  void op_au_bv_cw(const Operation& operation, OperationResult* result);
-
-  void op_make_steepest_desc_dir(const Operation& operation,
-                                 OperationResult* result);
-
-  void op_fix_dir_signs(const Operation& operation, OperationResult* result);
-
-  void op_dir_deriv(const Operation& operation, OperationResult* result);
-
-  void op_fix_omega_signs(const Operation& operation, OperationResult* result);
-
-  void op_cost(const Operation& operation, OperationResult* result);
-
-  void op_start_pass(const Operation& operation, OperationResult* result);
-  void op_finish_pass(const Operation& operation, OperationResult* result);
-
-  void op_apply(const Operation& operation, OperationResult* result);
-
-  void op_randomize(const Operation& operation, OperationResult* result);
-
-  void op_load(const Operation& operation, OperationResult* result);
-  void op_save(const Operation& operation, OperationResult* result);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2Main.cpp b/paddle/legacy/pserver/ParameterServer2Main.cpp
deleted file mode 100644
index dfbae0cd0f5..00000000000
--- a/paddle/legacy/pserver/ParameterServer2Main.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "ParameterServerController.h"
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-
-  std::unique_ptr<ParameterServerController> parameterServerPtr(
-      paddle::ParameterServerController::createFromGflags());
-  parameterServerPtr->start();
-  parameterServerPtr->wait();
-
-  return 0;
-}
diff --git a/paddle/legacy/pserver/ParameterServerController.cpp b/paddle/legacy/pserver/ParameterServerController.cpp
deleted file mode 100644
index 2a7dcc15aa6..00000000000
--- a/paddle/legacy/pserver/ParameterServerController.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServerController.h"
-
-namespace paddle {
-
-ParameterServerController::ParameterServerController(
-    const ParameterServerConfig& config) {
-  // round robin to load balance RDMA server ENGINE
-  std::vector<std::string> devices;
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = config.ports_num() + config.ports_num_for_sparse();
-
-  if (config.nics().empty()) {
-    parameterServers_.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (config.rdma_tcp() == "rdma") {
-        parameterServers_[i].reset(
-            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        parameterServers_[i].reset(
-            new ParameterServer2(std::string(), config.port() + i));
-      }
-      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
-                                             "server on port "
-                                          << config.port() + i;
-    }
-  } else {
-    str::split(config.nics(), ',', &devices);
-    parameterServers_.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (config.rdma_tcp() == "rdma") {
-          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          parameterServers_[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
-        }
-        CHECK(parameterServers_[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server with device " << devices[j]
-            << config.port() + i;
-      }
-    }
-  }
-}
-
-ParameterServerController::~ParameterServerController() { this->wait(); }
-
-ParameterServerController* ParameterServerController::createFromGflags() {
-  ParameterServerConfig config;
-
-  config.set_nics(FLAGS_nics);
-  config.set_rdma_tcp(FLAGS_rdma_tcp);
-  config.set_port(FLAGS_port);
-  config.set_ports_num(FLAGS_ports_num);
-  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
-
-  return create(config);
-}
-
-ParameterServerController* ParameterServerController::create(
-    const ParameterServerConfig& config) {
-  return new ParameterServerController(config);
-}
-
-void ParameterServerController::start() {
-  LOG(INFO) << "number of parameterServer instances: "
-            << parameterServers_.size();
-  int i = 0;
-  for (const auto& parameterServer : parameterServers_) {
-    LOG(INFO) << "Starting parameterServer[" << i << "]";
-    parameterServer->start();
-    i++;
-  }
-}
-
-void ParameterServerController::wait() {
-  int i = 0;
-  for (const auto& parameterServer : parameterServers_) {
-    LOG(INFO) << "Waiting parameterServer[" << i << "]";
-    parameterServer->join();
-    i++;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServerController.h b/paddle/legacy/pserver/ParameterServerController.h
deleted file mode 100644
index b90d0cbceaa..00000000000
--- a/paddle/legacy/pserver/ParameterServerController.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterServer2.h"
-#include "ParameterServerConfig.pb.h"
-#include "RDMANetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * @brief ParameterServerController is used for create, init and manage multi
- * parameter server instances. The num of the instances is decided by port
- * num(the ports number for parameter send) and network devices configured
- * by gflags or proto.
- */
-class ParameterServerController final {
- public:
-  DISABLE_COPY(ParameterServerController);
-
-  /**
-   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
-   */
-  explicit ParameterServerController(const ParameterServerConfig& config);
-
-  /**
-   * @brief Dtor.
-   */
-  ~ParameterServerController();
-
-  /**
-   * @brief create ParameterServerController from gflags, this is used for
-   * compatibility with the old usage of configuration by gflags.
-   */
-  static ParameterServerController* createFromGflags();
-
-  /**
-   * @brief create ParameterServerController with ParameterServerConfig, remove
-   * gflags from ParameterServer. Init all ParameterServer2 instances according
-   * to
-   * the config.
-   */
-  static ParameterServerController* create(const ParameterServerConfig& config);
-
-  /**
-   * @brief start all ParameterServer2 instances in this
-   * ParameterServerController.
-   */
-  void start();
-
-  /**
-   * @brief join and wait for all ParameterServer2 instances thread in this
-   * ParameterServerController.
-   */
-  void wait();
-
- private:
-  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ProtoServer.cpp b/paddle/legacy/pserver/ProtoServer.cpp
deleted file mode 100644
index 6b7948a7d0a..00000000000
--- a/paddle/legacy/pserver/ProtoServer.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoServer.h"
-
-namespace paddle {
-
-void ProtoServer::handleRequest(std::unique_ptr<MsgReader> msgReader,
-                                ResponseCallback callback) {
-  /// 0 for funcName
-  /// 1 for proto
-  CHECK_GE(msgReader->getNumBlocks(), (size_t)2);
-
-  std::string funcName(msgReader->getNextBlockLength(), 0);
-  /// read function name string
-  msgReader->readNextBlock(&funcName[0]);
-  /// looking up rpc wrapped callback function
-  auto it = nameToFuncMap_.find(funcName);
-  if (it != nameToFuncMap_.end()) {
-#ifndef PADDLE_DISABLE_TIMER
-    gettimeofday(&(*(handleRequestBegin_)), nullptr);
-#endif
-    it->second(std::move(msgReader), callback);
-  } else {
-    LOG(ERROR) << "Unknown funcName: " << funcName;
-    std::vector<iovec> iovs;
-    callback(iovs);
-  }
-}
-
-void ProtoServer::registerServiceFunctionImp(const std::string& funcName,
-                                             ServiceFunction func) {
-  CHECK(!nameToFuncMap_.count(funcName)) << "Duplicated registration: "
-                                         << funcName;
-  nameToFuncMap_[funcName] = func;
-}
-
-void ProtoClient::send(const char* funcName,
-                       const google::protobuf::MessageLite& proto,
-                       const std::vector<iovec>& userIovs) {
-  std::string protoStr;
-  CHECK(proto.SerializeToString(&protoStr));
-  std::vector<iovec> iovs;
-  iovs.reserve(iovs.size() + 2);
-  /// sending function name string, protobuf data and user additional data
-  iovs.push_back({(void*)funcName, strlen(funcName)});
-  iovs.push_back({&protoStr[0], protoStr.size()});
-  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
-  channel_->writeMessage(iovs);
-}
-
-std::unique_ptr<MsgReader> ProtoClient::recv(
-    google::protobuf::MessageLite* proto) {
-  std::vector<iovec> iovs;
-  std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
-  CHECK_GE(msgReader->getNumBlocks(), (size_t)1);
-  std::string str(msgReader->getNextBlockLength(), 0);
-  msgReader->readNextBlock(&str[0]);
-  CHECK(proto->ParseFromString(str));
-  return msgReader;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ProtoServer.h b/paddle/legacy/pserver/ProtoServer.h
deleted file mode 100644
index 2943867de58..00000000000
--- a/paddle/legacy/pserver/ProtoServer.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LightNetwork.h"
-
-#include <map>
-
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- *
- * It implements the rpc framework, which launchs one thread for each
- * connection. Here define one parameter server as single TCP server
- * binding on single port. All connections share single tcp ProtoServer
- * object, each connection handles all requests from specified trainer
- * within single worker thread.
- * to accelerate bandwidth efficiency and harness multicore for pserver
- * optimization to reduce pserver latency, you could launch more port
- * for single NIC hardward with --port=N(N>1) for small cluster job.
- */
-class ProtoServer : public SocketServer {
- public:
-  /// rdmaCpu controls the cpu affinity of RDMA server daemon,
-  /// which could benifit performance. rdmaCpu = -1 means TCP
-  /// is used instead of RDMA transport.
-  ProtoServer(const std::string& addr, int port, int rdmaCpu = -1)
-      : SocketServer(addr, port, rdmaCpu) {}
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut,
-                             const std::vector<iovec>& outputIovs)>
-      ProtoResponseCallbackEx;
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut)>
-      ProtoResponseCallback;
-
-  /**
-   * Register a service function for this server
-   * void(const ProtoIn& request,
-   *      ProtoResponseCallback callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-
-   * Use macro REGISTER_SERVICE_FUNCTION as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunction(
-      const std::string& funcName,
-      std::function<void(const ProtoIn& request,
-                         ProtoResponseCallback callback)> func);
-
-  /**
-   * Register a service function for this server
-   * The signature of the service function is
-   * void(const ProtoIn&,
-   *      std::unique_ptr<MsgReader> msgReader,
-   *      ProtoResponseCallbackEx callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-   * The extended service function can take extra input blocks from
-   * the communication channel by reading msgReader. It can also
-   * send extra blocks to the communication channel by providing
-   * outputIovs as the argument for the callback function.
-
-   * Use macro REGISTER_SERVICE_FUNCTION_EX as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunctionEx(
-      const std::string& funcName,
-      std::function<void(const ProtoIn&,
-                         std::unique_ptr<MsgReader> msgReader,
-                         ProtoResponseCallbackEx callback)> func);
-
- protected:
-  /**
-   * @brief handle rpc request
-   * @param[in] msgReader  Message reader for reading data from connection
-   * @param[in] callback   equal to channel->writeMessage
-   *
-   * @note  it lookups rpc function mapping table to find function pointer,
-   *        then call this function with further reading data from connection
-   */
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback);
-
-  typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback)>
-      ServiceFunction;
-
-  /**
-   * @brief register one RPC function in function mapping
-   * @param[in] funcName  function name string
-   * @param[in] func      rpc function wrapped with reading and writing data
-   */
-  void registerServiceFunctionImp(const std::string& funcName,
-                                  ServiceFunction func);
-
- protected:
-  /// Tuning bare network overhead: the beginning of receiving request
-  ThreadLocal<struct timeval> handleRequestBegin_;
-
-  /// mapping to find rpc function while handling request
-  std::map<std::string, ServiceFunction> nameToFuncMap_;
-};
-
-class ProtoClient : public SocketClient {
- public:
-  ProtoClient(const std::string& serverAddr,
-              int serverPort,
-              enum ChannelType channelType = F_TCP)
-      : SocketClient(serverAddr, serverPort, channelType) {}
-
-  /**
-   * @brief Make a request to the server.
-   * @param[in] funcName  request rpc function name string
-   * @param[in] proto     protobuf data for sending to pserver
-   * @param[in] iov       additional iov data for sending to pserver
-   *
-   * @note  iov provides additional blocks which need to be written to the
-   *        communication channel
-   */
-  void send(const char* funcName,
-            const google::protobuf::MessageLite& proto,
-            const std::vector<iovec>& iov = std::vector<iovec>());
-
-  /**
-   * @brief receive the response from the server.
-   * @param[in] proto     proto binary buffer
-   *
-   * @note  this must be paired with a corresponding send() call. The
-   *        returned MsgReader allows the caller to receive additional
-   *        blocks from the communication channel.
-   */
-  std::unique_ptr<MsgReader> recv(google::protobuf::MessageLite* proto);
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn);
-    return recv(protoOut);
-  }
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      const std::vector<iovec>& iov,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn, iov);
-    return recv(protoOut);
-  }
-};
-
-template <class>
-struct service_arg_type;
-/// helper class for obtaining the argument type of a service function
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(const Arg1&, Arg2)> {
-  typedef Arg1 _1;
-};
-
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(  // NOLINT
-    const Arg1&,
-    std::unique_ptr<MsgReader>,
-    Arg2)> {
-  typedef Arg1 _1;
-};
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION(className, funcName)       \
-  registerServiceFunction<                                   \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2))
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION_EX(className, funcName)    \
-  registerServiceFunctionEx<                                 \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2,                       \
-                std::placeholders::_3))
-
-/// create wrapper function for parameter server high level function and
-/// register the wrapper function into function mapping.
-template <class ProtoIn>
-void ProtoServer::registerServiceFunctionEx(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&,
-                       std::unique_ptr<MsgReader> msgReader,
-                       ProtoResponseCallbackEx callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    auto pcob = [callback](const google::protobuf::MessageLite& response,
-                           const std::vector<iovec>& outputIovs) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end());
-      callback(iovs);
-    };
-
-    func(request, std::move(msgReader), pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-template <class ProtoIn>
-void ProtoServer::registerServiceFunction(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&, ProtoResponseCallback callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    msgReader.reset();
-
-    auto pcob = [callback](const google::protobuf::MessageLite& response) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      callback(iovs);
-    };
-
-    func(request, pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/RDMANetwork.h b/paddle/legacy/pserver/RDMANetwork.h
deleted file mode 100644
index c87056f72c5..00000000000
--- a/paddle/legacy/pserver/RDMANetwork.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_DISABLE_RDMA
-#include "sxi_sock.h"
-#else
-#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma"
-#endif
-#include "paddle/legacy/utils/Logging.h"
-
-#include <netinet/in.h>
-struct sxi_sock;
-struct sxi_socket;
-
-#ifndef MAX_VEC_SIZE
-// define default MAX_VEC_SIZE
-#define MAX_VEC_SIZE (1UL << 16)
-#endif
-
-namespace paddle {
-/// Namespace rdma is adaptors for sxi_sock.h. Make paddle not depend on it
-/// when disable rdma support
-namespace rdma {
-inline int numCpus() {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_num_configured_cpus();
-#else
-  return 0;
-#endif
-}
-
-inline sxi_socket* ssocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_ssocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int listen(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_listen(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int bind(sxi_socket* s, const char* str) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_bind(s, str);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* accept(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_accept(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sockaddr_in* getSourceAddress(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return reinterpret_cast<sockaddr_in*>(&sock->sa);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_socket* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_socket_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_sock_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline void init() {
-#ifndef PADDLE_DISABLE_RDMA
-  sxi_module_init();
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_socket* csocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_csocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t read(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_read(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t write(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_write(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_readv(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_writev(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* connect(sxi_socket* socket, const char* url) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_connect(socket, url);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-}  //  namespace rdma
-}  //  namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.cpp b/paddle/legacy/pserver/SocketChannel.cpp
deleted file mode 100644
index 79c763c62ba..00000000000
--- a/paddle/legacy/pserver/SocketChannel.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SocketChannel.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include "RDMANetwork.h"
-
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
- * declares it on osx/ios if defined(KERNEL)
- */
-#ifndef UIO_MAXIOV
-#define UIO_MAXIOV 512
-#endif
-
-SocketChannel::~SocketChannel() {
-  if (tcpRdma_ == F_TCP)
-    close(tcpSocket_);
-  else
-    rdma::close(rdmaSocket_);
-  LOG(INFO) << "destory connection in socket channel, peer = " << peerName_;
-}
-
-size_t SocketChannel::read(void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::read(tcpSocket_, (char*)buf + total, size - total);
-    else
-      len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-size_t SocketChannel::write(const void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::write(tcpSocket_, (const char*)buf + total, size - total);
-    else
-      len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-template <class IOFunc, class SocketType>
-static size_t readwritev(IOFunc iofunc,
-                         SocketType socket,
-                         iovec* iovs,
-                         int iovcnt,
-                         int maxiovs,
-                         const std::string& peerName) {
-  int curIov = 0;
-  size_t total = 0;
-
-  for (int i = 0; i < iovcnt; ++i) {
-    total += iovs[i].iov_len;
-  }
-
-  size_t size = 0;
-  size_t curIovSizeDone = 0;
-
-  while (size < total) {
-    ssize_t len =
-        iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
-    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
-                   << " iovCnt=" << iovcnt
-                   << " iovs[curIov].base=" << iovs[curIov].iov_base
-                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
-    size += len;
-
-    /// restore iovs[curIov] to the original value
-    iovs[curIov].iov_base =
-        (void*)((char*)iovs[curIov].iov_base - curIovSizeDone);
-    iovs[curIov].iov_len += curIovSizeDone;
-
-    len += curIovSizeDone;
-
-    while (curIov < iovcnt) {
-      if ((size_t)len < iovs[curIov].iov_len) break;
-      len -= iovs[curIov].iov_len;
-      ++curIov;
-    }
-    if (curIov < iovcnt) {
-      curIovSizeDone = len;
-      iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len);
-      iovs[curIov].iov_len -= len;
-    }
-  }
-  return size;
-}
-
-/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload
-/// transfering
-size_t SocketChannel::writev(const std::vector<struct iovec>& iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::writev,
-                      tcpSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::writev,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-size_t SocketChannel::readv(std::vector<struct iovec>* iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::readv,
-                      tcpSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::readv,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
-  MessageHeader header;
-  header.numIovs = userIovs.size();
-
-  std::vector<size_t> iovLengths;
-  iovLengths.reserve(userIovs.size());
-  for (auto& iov : userIovs) {
-    iovLengths.push_back(iov.iov_len);
-  }
-
-  std::vector<iovec> iovs;
-  iovs.reserve(userIovs.size() + 2);
-  iovs.push_back({&header, sizeof(header)});
-  iovs.push_back({&iovLengths[0],
-                  static_cast<size_t>(sizeof(iovLengths[0]) * header.numIovs)});
-  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
-
-  header.totalLength = 0;
-  for (auto& iov : iovs) {
-    header.totalLength += iov.iov_len;
-  }
-
-  CHECK(writev(iovs) == (size_t)header.totalLength);
-}
-
-std::unique_ptr<MsgReader> SocketChannel::readMessage() {
-  MessageHeader header;
-
-  size_t len = read(&header, sizeof(header));
-  if (len == 0) {
-    return nullptr;
-  }
-
-  CHECK(len == sizeof(header));
-
-  std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
-
-  CHECK_EQ(msgReader->getTotalLength() + sizeof(header) +
-               msgReader->getNumBlocks() * sizeof(size_t),
-           (size_t)header.totalLength)
-      << " totalLength=" << msgReader->getTotalLength()
-      << " numBlocks=" << msgReader->getNumBlocks();
-  return msgReader;
-}
-
-MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
-    : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
-  size_t size = numBlocks * sizeof(blockLengths_[0]);
-  CHECK(channel_->read(&blockLengths_[0], size) == size);
-}
-
-void MsgReader::readBlocks(const std::vector<void*>& bufs) {
-  CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size());
-  std::vector<iovec> iovs;
-  iovs.reserve(bufs.size());
-  size_t totalLength = 0;
-  for (void* buf : bufs) {
-    iovs.push_back({buf, getNextBlockLength()});
-    totalLength += getNextBlockLength();
-    ++currentBlockIndex_;
-  }
-
-  CHECK(channel_->readv(&iovs) == totalLength);
-}
-
-void MsgReader::readNextBlock(void* buf) {
-  CHECK_LT(currentBlockIndex_, blockLengths_.size());
-  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
-  ++currentBlockIndex_;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.h b/paddle/legacy/pserver/SocketChannel.h
deleted file mode 100644
index a7b3cd42f0a..00000000000
--- a/paddle/legacy/pserver/SocketChannel.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <sys/uio.h>
-
-#include <memory>
-#include <vector>
-
-struct sxi_sock;
-
-namespace paddle {
-
-class SocketChannel;
-enum ChannelType {
-  F_TCP = 1,
-  F_RDMA = 2,
-};
-
-/// reading a set of blocks of data from SocketChannel.
-class MsgReader {
- public:
-  MsgReader(SocketChannel* channel, size_t numIovs);
-  ~MsgReader() {
-    /// ensure all data blocks have been processed
-    CHECK_EQ(currentBlockIndex_, blockLengths_.size());
-  }
-  /**
-   * @brief number of remaining parts
-   */
-  size_t getNumBlocks() const {
-    return blockLengths_.size() - currentBlockIndex_;
-  }
-
-  /**
-   * @brief lenght of next block
-   */
-  size_t getNextBlockLength() const { return getBlockLength(0); }
-
-  /**
-   * @brief get the total length of all the remaining blocks
-   */
-  size_t getTotalLength() const {
-    size_t total = 0;
-    for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) {
-      total += blockLengths_[i];
-    }
-    return total;
-  }
-
-  /**
-   * @brief Get the length for block currentBlockIndex + i
-   */
-  size_t getBlockLength(size_t i) const {
-    return blockLengths_[currentBlockIndex_ + i];
-  }
-
-  /**
-   * @brief  read blocks data and store it to buf
-   */
-  void readBlocks(const std::vector<void*>& bufs);
-  void readNextBlock(void* buf);
-
- protected:
-  SocketChannel* channel_;
-  std::vector<size_t> blockLengths_;
-  size_t currentBlockIndex_;
-};
-
-/// APIs for reading and writing byte stream data or naive iov data
-/// from the APIs both RDMA and TCP exhibits byte stream style
-class SocketChannel {
- public:
-  SocketChannel(int socket, const std::string& peerName)
-      : tcpSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_TCP;
-  }
-  SocketChannel(struct sxi_sock* socket, const std::string& peerName)
-      : rdmaSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_RDMA;
-  }
-
-  ~SocketChannel();
-
-  const std::string& getPeerName() const { return peerName_; }
-
-  /**
-   * @brief read size bytes.
-   *
-   * @note  keep reading until getting size bytes or sock is closed
-   *        is closed
-   */
-  size_t read(void* buf, size_t size);
-
-  /**
-   * @brief write size bytes.
-   *
-   * @note  keep writing until writing size bytes or sock is closed
-   */
-  size_t write(const void* buf, size_t size);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are written or sock is closed
-   */
-  size_t writev(const std::vector<struct iovec>& iov);
-
-  /**
-   * @brief read a set of buffers.
-   *
-   * @note  keep reading until all buffers are full or sock is closed.
-   */
-  size_t readv(std::vector<struct iovec>* iov);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are passed or sock is closed
-   */
-  void writeMessage(const std::vector<struct iovec>& iov);
-
-  /// return null to indicate socket is closed
-  std::unique_ptr<MsgReader> readMessage();
-
- protected:
-  struct MessageHeader {
-    int64_t totalLength;  /// include the header
-    int64_t numIovs;
-    int64_t iovLengths[0];
-  };
-
-  int tcpSocket_;
-  struct sxi_sock* rdmaSocket_;
-  const std::string peerName_;
-  enum ChannelType tcpRdma_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.cpp b/paddle/legacy/pserver/SparseParameterDistribution.cpp
deleted file mode 100644
index 3f17b228f0e..00000000000
--- a/paddle/legacy/pserver/SparseParameterDistribution.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Flags.h"
-
-#include "SparseParameterDistribution.h"
-
-DEFINE_bool(check_sparse_distribution_in_pserver,
-            false,
-            "check whether sparse parameter exhibts balanced distribution at "
-            "all pservers");
-DEFINE_bool(show_check_sparse_distribution_log,
-            false,
-            "show logs details for sparse parameter distribution in pserver");
-DEFINE_int32(check_sparse_distribution_batches,
-             100,
-             "run sparse parameter distribution check for N batches");
-DEFINE_double(
-    check_sparse_distribution_ratio,
-    0.6,
-    "if parameters dispatched to different pservers exhibit unbalanced "
-    " distribution for check_sparse_distribution_ratio * "
-    " check_sparse_distribution_batches times, crash program");
-DEFINE_double(check_sparse_distribution_unbalance_degree,
-              2.0,
-              "the ratio of maximum data size and minimun data size for "
-              "different pserver");
-
-namespace paddle {
-
-SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) {
-  totBytes_ = 0;
-  data_.resize(serviceNum);
-
-  batchPassed_ = 0;
-  unbalanceCnt_ = 0;
-}
-
-void SparseParameterDistribution::probeDistribution(int serverId,
-                                                    size_t dataSize) {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ > FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  CHECK_LT((size_t)serverId, data_.size())
-      << "invalid sparse parameter distribution probe";
-
-  data_[serverId] += dataSize;
-  totBytes_ += dataSize;
-}
-
-void SparseParameterDistribution::checkAndResetDistribution() {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ >= FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  /// at runtime, prepareSendData is called by many contexts,
-  /// so need to check if data is avaiable.
-  if (!totBytes_) {
-    return;
-  }
-
-  /// check if distribution is balanced
-  auto avgSize = totBytes_ / data_.size();
-  auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree;
-  for (auto& dataSize : data_) {
-    if (dataSize > unbalanceDegree * avgSize ||
-        dataSize * unbalanceDegree < avgSize) {
-      unbalanceCnt_++;
-      break;
-    }
-  }
-
-  auto printData = [&]() {
-    std::stringstream ss;
-    for (auto& dataSize : data_) {
-      ss << dataSize * 0.001 << "KB ";
-    }
-    ss << std::endl;
-    LOG(INFO) << ss.str();
-  };
-
-  /// show all sparse data size for different pserver
-  if (FLAGS_show_check_sparse_distribution_log) {
-    LOG(INFO) << "sparse distribution:";
-    printData();
-  }
-
-  totBytes_ = 0;
-  batchPassed_++;
-
-  if (batchPassed_ == FLAGS_check_sparse_distribution_batches) {
-    LOG(INFO) << "show last parameter distribution sample:";
-    printData();
-    LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_
-              << " in passed batches: " << batchPassed_;
-    CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_,
-             FLAGS_check_sparse_distribution_ratio)
-        << "unbalanced sparse parameter distribution for different pserver. "
-        << "it could be caused by unbalanced sparse ids distribution, try "
-        << "to shuffle dimensions in input samples";
-  }
-
-  std::fill(data_.begin(), data_.end(), 0);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.h b/paddle/legacy/pserver/SparseParameterDistribution.h
deleted file mode 100644
index ee78029958f..00000000000
--- a/paddle/legacy/pserver/SparseParameterDistribution.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <unistd.h>
-
-#include <atomic>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/*
- * if sparse_remote_updater is used, different ParameterServer could
- * be assigned with unbalanced gradients. the parameter value from
- * ParameterServer also be not balanced. the distribution of different
- * dimensions of sparse ids determines the unbalanced degree of data
- * distributed among all ParameterServers. Even distribution will
- * benifits cluster efficiency.
- * do check the unbalanced degree of gradients at runtime, crash program
- * if unbalanced distribution exhibts by default.
- */
-class SparseParameterDistribution {
- public:
-  /// serviceNum means the number of ParameterServers
-  explicit SparseParameterDistribution(size_t serviceNum);
-  ~SparseParameterDistribution() {}
-  /// collect data
-  void probeDistribution(int serverId, size_t data);
-  void checkAndResetDistribution();
-
- private:
-  std::vector<size_t> data_;
-  std::atomic<size_t> totBytes_;
-
-  /// after some batches, stop to check
-  int batchPassed_;
-
-  /// stat on unbalanced distribution found
-  int unbalanceCnt_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/test/.gitignore b/paddle/legacy/pserver/test/.gitignore
deleted file mode 100644
index aeb58c5b562..00000000000
--- a/paddle/legacy/pserver/test/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-log
-test_ParameterServer
-test_ParameterServer2
-socket_test
-test_ProtoServer
diff --git a/paddle/legacy/pserver/test/CMakeLists.txt b/paddle/legacy/pserver/test/CMakeLists.txt
deleted file mode 100644
index b66a00ba065..00000000000
--- a/paddle/legacy/pserver/test/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-######################### socket_test ########################
-add_unittest_without_exec(socket_test
-    SocketTest.cpp)
-
-add_test(NAME socket_test
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
-        ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10)
-
-####################### test_ProtoServer ####################
-add_unittest_without_exec(test_ProtoServer
-    test_ProtoServer.cpp)
-
-IF(NOT ON_TRAVIS)
-    add_test(NAME test_ProtoServer
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
-            ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
-ENDIF(NOT ON_TRAVIS)
-
-# TODO(yuyang18): Run test_ProtoServer when with rdma
-# add_test(NAME test_ProtoServerRDMA
-#   COMMAND ...)
-
-#################### test_ParameterServer2 ####################
-add_unittest_without_exec(test_ParameterServer2
-    test_ParameterServer2.cpp)
-add_test(NAME test_ParameterServer2
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4
-        ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2)
diff --git a/paddle/legacy/pserver/test/SocketTest.cpp b/paddle/legacy/pserver/test/SocketTest.cpp
deleted file mode 100644
index 3a781fcbf65..00000000000
--- a/paddle/legacy/pserver/test/SocketTest.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include <thread>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-
-struct MessageHeader {
-  int64_t dataLength;
-};
-
-class Thread {
- public:
-  void start();
-  virtual void run() = 0;
-  virtual ~Thread() {}
-
- protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-void Thread::start() {
-  thread_.reset(new std::thread([this]() { this->run(); }));
-}
-
-class SocketChannel {
- public:
-  explicit SocketChannel(int socket) : socket_(socket) {}
-  int getSocketFd() const { return socket_; }
-  uint64_t readAll(void* buf, size_t size);
-  uint64_t writeAll(const void* buf, size_t size);
-
- protected:
-  int socket_;
-};
-
-uint64_t SocketChannel::readAll(void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = read(socket_, (char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = write(socket_, (const char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-class SocketWorker : public Thread {
- public:
-  explicit SocketWorker(int socket) : channel_(socket) {}
-  virtual void run();
-
-  // read n bytes.
-  int64_t readAll(char* buf, size_t n);
-
-  // write n bytes
-
- protected:
-  SocketChannel channel_;
-  std::string buffer_;
-};
-
-class SocketServer : public Thread {
- public:
-  explicit SocketServer(int port)
-      : port_(port), socket_(0), maxPendingConnections_(100) {}
-
-  virtual void run();
-
- protected:
-  int port_;
-  int socket_;
-  int maxPendingConnections_;
-};
-
-void SocketServer::run() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-
-  /* First call to socket() function */
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /* Initialize socket structure */
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = INADDR_ANY;
-  serv_addr.sin_port = htons(port_);
-
-  /* Now bind the host address using bind() call.*/
-  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding";
-
-  /* Now start listening for the clients, here process will
-   * go in sleep mode and will wait for the incoming connection
-   */
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /* Accept actual connection from the client */
-    newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-
-    SocketWorker* worker = new SocketWorker(newsockfd);
-    worker->start();
-  }
-}
-
-void SocketWorker::run() {
-  MessageHeader header;
-
-  while (true) {
-    int64_t n = channel_.readAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-
-    buffer_.resize(header.dataLength);
-    n = channel_.readAll(&buffer_[0], header.dataLength);
-    CHECK(n == header.dataLength) << "ERROR reading from socket";
-
-    /* Write a response to the client */
-    n = channel_.writeAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-    n = channel_.writeAll(buffer_.data(), buffer_.size());
-    CHECK(n == header.dataLength) << "ERROR writing to socket";
-  }
-}
-
-class SocketClient {
- public:
-  SocketClient(const std::string& serverAddr, int serverPort);
-  SocketChannel* getChannel() const { return channel_.get(); }
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-};
-
-SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent* server;
-
-  // char buffer[256];
-
-  /* Create a socket point */
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-  server = gethostbyname(serverAddr.c_str());
-  CHECK(server) << "ERROR, no such host: " << serverAddr;
-
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char*)server->h_addr,
-        (char*)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  /* Now connect to the server */
-  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR connecting";
-
-  channel_.reset(new SocketChannel(sockfd));
-}
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 10000000, "Data size");
-DEFINE_int32(loop_time, 100000, "test loop time");
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  SocketServer server(FLAGS_port);
-  server.start();
-  sleep(1);
-
-  SocketClient client(FLAGS_server_addr, FLAGS_port);
-
-  SocketChannel* channel = client.getChannel();
-
-  MessageHeader header;
-
-  uint64_t dataSize = FLAGS_dim * sizeof(real);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-#else
-  CpuVector gpuParam(FLAGS_dim);
-  CpuVector gpuGrad(FLAGS_dim);
-#endif
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int i = 0; i < FLAGS_loop_time; ++i) {
-    cpuGrad.copyFrom(gpuGrad);
-
-    header.dataLength = dataSize;
-    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
-        << "Client write header error";
-
-    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
-        << "Client write data error";
-
-    /* Now read server response */
-    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
-        << "Client read header error";
-
-    CHECK_EQ((uint64_t)header.dataLength, dataSize);
-    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
-        << "Client read data error";
-
-    gpuParam.copyFrom(cpuParam);
-
-    LOG_EVERY_N(INFO, 100) << "i=" << i;
-  }
-  exit(0);
-}
diff --git a/paddle/legacy/pserver/test/test_ParameterServer2.cpp b/paddle/legacy/pserver/test/test_ParameterServer2.cpp
deleted file mode 100644
index 542e80e0469..00000000000
--- a/paddle/legacy/pserver/test/test_ParameterServer2.cpp
+++ /dev/null
@@ -1,624 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterClient2.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-#include <paddle/legacy/utils/Flags.h>
-#include <paddle/legacy/utils/Util.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(num_gradient_servers);
-DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-DEFINE_int32(server_cpu, 0, "assign server cpu");
-
-class ParameterServer2Tester : public ParameterServer2 {
- public:
-  ParameterServer2Tester(std::string serverAddr,
-                         int port,
-                         int rdmaCpu = -1,
-                         bool sepSendAndRecv = false)
-      : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {}
-  virtual ~ParameterServer2Tester() {}
-  void setup() {
-    CHECK(ParameterServer2::init());
-
-    parameters_.clear();
-    clientConfigs_.clear();
-
-    clientConfigs_.resize(2);
-    {
-      ParameterConfig& config = clientConfigs_[0];
-      config.set_name("para0");
-      config.set_para_id(0);
-      config.set_size(10000);
-      config.set_device(-1);
-      config.set_learning_rate(1.0);
-      config.set_momentum(0.9);
-    }
-
-    {
-      ParameterConfig& config = clientConfigs_[1];
-      config.set_name("para1");
-      config.set_para_id(1);
-      config.set_size(5000);
-      config.set_device(-1);
-      config.set_learning_rate(0.5);
-      config.set_momentum(0.4);
-    }
-
-    for (auto& config : clientConfigs_) {
-      parameters_.emplace_back(new Parameter(config, /* useGpu= */ false));
-    }
-
-    size_t id = 0;
-    for (auto& para : parameters_) {
-      para->setID(id++);
-    }
-
-    CHECK(client_.init(parameters_));
-    OptimizationConfig optConfig;
-    optConfig.set_algorithm("async_sgd");
-    optConfig.set_batch_size(100);
-    optConfig.set_learning_rate(0.1);
-    client_.setConfig(optConfig);
-    client_.setParameter();
-  }
-
-  void setConfigTest();
-  void setStatusTest();
-  void sendParameterTest();
-  void sendDataTest(SendDataType type, size_t size);
-  void operationTest();
-  void mergeBlockSegmentTest();
-  void checkSegments(const BlockSegments& expected, const BlockSegments& segs);
-  void waitPassFinishTest();
-  void synchronizeTest();
-
- protected:
-  ParameterClient2 client_;
-  vector<ParameterConfig> clientConfigs_;
-  vector<ParameterPtr> parameters_;
-};
-
-std::unique_ptr<ParameterServer2Tester> g_server;
-
-void ParameterServer2Tester::setConfigTest() {
-  setup();
-
-  for (auto& config : clientConfigs_) {
-    auto it = configMap_.find(config.para_id());
-    EXPECT_TRUE(it != configMap_.end());
-    auto& serverConfig = it->second;
-    EXPECT_EQ(config.name(), serverConfig.name());
-    EXPECT_EQ(config.size(), serverConfig.size());
-    EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate());
-    EXPECT_EQ(config.momentum(), serverConfig.momentum());
-  }
-}
-
-void ParameterServer2Tester::setStatusTest() {
-  setup();
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET));
-  client_.setStatus(PSERVER_STATUS_PARAMETER_READY);
-  EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_);
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY));
-}
-
-real sumVector(const CpuVector& vec) {
-  const real* data = vec.getData();
-  size_t dim = vec.getSize();
-  real sum = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    sum += data[i];
-  }
-  return sum;
-}
-
-void ParameterServer2Tester::sendParameterTest() {
-  setup();
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,       // numSamples = 0
-                                  0,       // cost = 0
-                                  false);  // sendBackParameter = false
-
-  vector<ParameterPtr> parameterCopies;
-
-  for (auto& parameter : parameters_) {
-    parameterCopies.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCopies.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-  }
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,      // numSamples = 0
-                                  0,      // cost = 0
-                                  true);  // sendBackParameter = true
-
-  for (size_t i = 0; i != parameters_.size(); ++i) {
-    real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData();
-    real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData();
-    EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize());
-    size_t size = parameters_[i]->getSize();
-    real sum1 = 0, sum2 = 0;
-    for (size_t j = 0; j < size; ++j) {
-      sum1 += v1[j];
-      sum2 += v2[j];
-    }
-    EXPECT_EQ(sum1, sum2);
-  }
-}
-
-void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) {
-  ParameterClient2 client1(true);
-  client1.init(parameters_);
-  ParameterClient2 client2(true);
-  client2.init(parameters_);
-  ParameterClient2 client3(true);
-  client3.init(parameters_);
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  double* testData1 = new double[size];
-  double* testData2 = new double[size];
-  double* testData3 = new double[size];
-  double* getDataExpect = new double[size];
-  double* getDataReal = new double[size];
-  for (size_t i = 0; i < size; ++i) {
-    testData1[i] = rand();  // NOLINT TODO(yuyang18): Use rand_r instead.
-    testData2[i] = rand();  // NOLINT
-    testData3[i] = rand();  // NOLINT
-    getDataExpect[i] = testData1[i] + testData2[i] + testData3[i];
-  }
-
-  auto put1 = [&]() {
-    LOG(INFO) << "putOwnData1 start";
-    client1.putOwnData(0, type, testData1, size);
-    LOG(INFO) << "putOwnData1 finish";
-  };
-
-  auto get1 = [&]() {
-    LOG(INFO) << "sendData1 get all start";
-    client1.getAllData(0, type, getDataReal, size);
-    for (size_t i = 0; i < size; ++i) {
-      CHECK_EQ(getDataReal[i], getDataExpect[i]);
-    }
-    LOG(INFO) << "sendData1 get all finish";
-  };
-
-  auto put2 = [&]() {
-    LOG(INFO) << "putOwnData2 start";
-    client2.putOwnData(1, type, testData2, size);
-    LOG(INFO) << "putOwnData2 finish";
-  };
-
-  auto put3 = [&]() {
-    LOG(INFO) << "putOwnData3 start";
-    client3.putOwnData(2, type, testData3, size);
-    LOG(INFO) << "putOwnData3 finish";
-  };
-
-  worker1.addJob(put1);
-  worker1.addJob(get1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-
-  worker1.addJob(put1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-  worker1.addJob(get1);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-  free(testData1);
-  free(testData2);
-  free(testData3);
-  free(getDataExpect);
-  free(getDataReal);
-}
-
-void ParameterServer2Tester::operationTest() {
-  PServerVector v1, v2;
-  v1 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle);
-
-  v2 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RESET, v1, (real)1);
-  ops.addOperation(PSERVER_OP_RESET, v2, (real)2);
-
-  real res1, res2, res3;
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3);
-  client_.doOperation(ops, false, false);
-
-  EXPECT_EQ(30000, res1);
-  EXPECT_EQ(15000, res2);
-  EXPECT_EQ(0, res3);
-
-  PServerMatrix m1, m2;
-  m1 = client_.createMatrix(4);
-  EXPECT_EQ(0, m1.handle);
-  m2 = client_.createMatrix(8);
-  EXPECT_EQ(1, m2.handle);
-
-  // TODO(yuyang18): add tests for other operations OP_COPY, OP_au
-
-  client_.releaseVector(v1);
-  client_.releaseVector(v2);
-  client_.releaseMatrix(m1);
-  client_.releaseMatrix(m2);
-}
-
-void ParameterServer2Tester::checkSegments(const BlockSegments& expected,
-                                           const BlockSegments& segs) {
-  EXPECT_EQ(expected.size(), segs.size());
-  if (expected.size() != segs.size()) {
-    return;
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    EXPECT_EQ(expected[i], segs[i]);
-  }
-}
-
-void ParameterServer2Tester::mergeBlockSegmentTest() {
-  {
-    BlockSegments segs{{10, 20}, {30, 45}, {50, 70}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 20}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {10, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 35}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 60}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {30, 47}};
-    mergeSegments(&segs);
-    checkSegments({{30, 47}, {50, 70}}, segs);
-  }
-}
-
-void ParameterServer2Tester::waitPassFinishTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-  ParameterClient2 client3;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto init3 = [&]() {
-    LOG(INFO) << "init3 start";
-    client3.init(parameters_);
-    LOG(INFO) << "init3 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.waitPassFinish();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.waitPassFinish();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  auto op3 = [&]() {
-    LOG(INFO) << "op3 start";
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_SGD);
-    client3.doOperation(ops,
-                        /* waitForGradient= */ true,
-                        /* sendBackarameter= */ true);
-    LOG(INFO) << "op3 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  worker3.addJob(init3);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 2 finished";
-}
-
-void ParameterServer2Tester::synchronizeTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-
-  FLAGS_log_period_server = 2;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    client1.setTrainerId(0);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    client2.setTrainerId(1);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.asyncFinishPass();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.asyncFinishPass();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  // call wait to reset some stats at pserver
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker1.wait();
-  worker2.wait();
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 2 finished";
-}
-
-TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); }
-
-TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); }
-
-TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); }
-
-TEST(ParameterServer2, operation) { g_server->operationTest(); }
-
-TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); }
-
-TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); }
-
-TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); }
-
-TEST(ParameterServer2, sendData) {
-  // Set gserver and pserver all 3, so that the test is sufficient.
-  int oldFlagsPortsNUm = FLAGS_ports_num;
-  int oldFlagsNumGradientServers = FLAGS_num_gradient_servers;
-  int oldFlagsPort = FLAGS_port;
-  FLAGS_ports_num = 3;
-  FLAGS_num_gradient_servers = 3;
-  FLAGS_port = FLAGS_port + 1;
-  std::unique_ptr<ParameterServer2Tester> g_server1;
-  std::unique_ptr<ParameterServer2Tester> g_server2;
-  std::unique_ptr<ParameterServer2Tester> g_server3;
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server1.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-    g_server1->start();
-    g_server2.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1));
-    g_server2->start();
-    g_server3.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2));
-    g_server3->start();
-  } else {  // tcp
-    g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-    g_server1->start();
-    g_server2.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1));
-    g_server2->start();
-    g_server3.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2));
-    g_server3->start();
-  }
-
-  g_server2->init();
-  g_server3->init();
-  sleep(2);
-  g_server1->setup();
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24);
-  sleep(2);
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 2);
-  sleep(2);
-  g_server1.reset();
-  g_server2.reset();
-  g_server3.reset();
-
-  FLAGS_ports_num = oldFlagsPortsNUm;
-  FLAGS_num_gradient_servers = oldFlagsNumGradientServers;
-  FLAGS_port = oldFlagsPort;
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  FLAGS_num_gradient_servers = 2;
-
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-  } else {
-    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-  }
-
-  g_server->start();
-
-  sleep(2);
-
-  int ret = RUN_ALL_TESTS();
-
-  g_server.reset();
-
-  exit(ret);
-}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.cpp b/paddle/legacy/pserver/test/test_ProtoServer.cpp
deleted file mode 100644
index f7ab2e8af45..00000000000
--- a/paddle/legacy/pserver/test/test_ProtoServer.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <memory>
-#include "ParameterService.pb.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/pserver/ProtoServer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 50000000, "Data size");
-DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
-
-using namespace paddle;  // NOLINT
-
-class MyServer : public ProtoServer {
- public:
-  explicit MyServer(int port, int rdmaCpu = -1)
-      : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
-        status_(PSERVER_STATUS_NOT_SET) {
-    REGISTER_SERVICE_FUNCTION(MyServer, getStatus);
-    REGISTER_SERVICE_FUNCTION(MyServer, setStatus);
-    REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx);
-  }
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    callback(response);
-  }
-
-  void getStatusEx(const GetStatusRequest& request,
-                   std::unique_ptr<MsgReader> msgReader,
-                   ProtoResponseCallbackEx callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    buffer_.resize(msgReader->getNextBlockLength());
-    msgReader->readNextBlock(&buffer_[0]);
-    callback(response, {{&buffer_[0], buffer_.size()}});
-  }
-
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    SetStatusResponse response;
-    status_ = request.status();
-    callback(response);
-  }
-
- protected:
-  PServerStatus status_;
-  std::string buffer_;
-};
-
-TEST(ProtoServer, regular) {
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    auto msgReader = client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET);
-    EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0);
-  }
-
-  {
-    SetStatusRequest request;
-    SetStatusResponse response;
-    request.set_status(PSERVER_STATUS_PARAMETER_READY);
-    client->sendAndRecv("setStatus", request, &response);
-  }
-
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY);
-  }
-
-  delete client;
-}
-
-TEST(ProtoServer, extended) {
-#ifdef PADDLE_WITH_CUDA
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  int64_t dataSize = FLAGS_dim * sizeof(real);
-
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int k = 0; k < 4; ++k) {
-    for (int i = 0; i < 10; ++i) {
-      cpuGrad.copyFrom(gpuGrad);
-      if (FLAGS_test_proto_server) {
-        GetStatusRequest request;
-        GetStatusResponse response;
-        {
-          REGISTER_TIMER("sendAndRecv");
-          auto msgReader =
-              client->sendAndRecv("getStatusEx",
-                                  request,
-                                  {{cpuGrad.getData(), (size_t)dataSize}},
-                                  &response);
-
-          EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1);
-          EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize);
-          msgReader->readNextBlock(cpuParam.getData());
-        }
-        if (!FLAGS_benchmark) {
-          real* v1 = cpuGrad.getData();
-          real* v2 = cpuParam.getData();
-          real sum1 = 0, sum2 = 0;
-          for (int j = 0; j < FLAGS_dim; ++j) {
-            sum1 += v1[j];
-            sum2 += v2[j];
-          }
-          EXPECT_EQ(sum1, sum2);
-        }
-      }
-      gpuParam.copyFrom(cpuParam);
-
-      LOG_EVERY_N(INFO, 10) << "i=" << i;
-    }
-    globalStat.printAllStatus();
-    globalStat.reset();
-  }
-
-  delete client;
-#endif
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
-  server.start();
-  usleep(10000);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.sh b/paddle/legacy/pserver/test/test_ProtoServer.sh
deleted file mode 100755
index 14393508473..00000000000
--- a/paddle/legacy/pserver/test/test_ProtoServer.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -x
-for ((port=12340;port<=12360;port++))
-do
-    port_used_num=`netstat -a |grep $port|wc -l`
-    if [ $port_used_num -eq 0 ]
-    then
-        echo $port;
-        legacy/pserver/test/test_ProtoServer --port=$port
-        if [ $? -eq 0 ]
-           then
-               exit 0
-           else
-               echo "test_ProtoServer run wrong"
-       	       exit 1
-        fi
-fi
-done
-echo "test_ProtoServer port not found"
-exit 1
diff --git a/paddle/legacy/trainer/CMakeLists.txt b/paddle/legacy/trainer/CMakeLists.txt
deleted file mode 100644
index 6192de4388c..00000000000
--- a/paddle/legacy/trainer/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-# paddle trainer package
-
-set(TRAINER_SOURCES
-        ParameterUpdater.cpp
-        ParamUtil.cpp
-        RemoteParameterUpdater.cpp
-        NewRemoteParameterUpdater.cpp
-        Tester.cpp
-        Trainer.cpp
-        TrainerInternal.cpp
-        TrainerBenchmark.cpp
-        ThreadParameterUpdater.cpp
-        TrainerInternalConfig.cpp
-        TrainerConfigHelper.cpp)
-
-set(TRAINER_HEADERS
-        ParameterUpdater.h
-        ParamUtil.h
-        RemoteParameterUpdater.h
-        NewRemoteParameterUpdater.h
-        Tester.h
-        TesterConfig.h
-        Trainer.h
-        TrainerInternal.h
-        TrainerInternalConfig.h
-        ThreadParameterUpdater.h
-        TrainerConfigHelper.h)
-
-if(NOT WITH_GOLANG)
-  list(REMOVE_ITEM TRAINER_SOURCES
-          NewRemoteParameterUpdater.cpp)
-  list(REMOVE_ITEM TRAINER_HEADERS
-          NewRemoteParameterUpdater.h)
-endif()
-
-add_library(paddle_trainer_lib STATIC
-    ${TRAINER_SOURCES})
-
-add_dependencies(paddle_trainer_lib
-    paddle_proto
-    ${external_project_dependencies})
-
-macro(add_paddle_exe TARGET_NAME)
-  add_executable(${TARGET_NAME} ${ARGN})
-  link_paddle_exe(${TARGET_NAME})
-endmacro()
-
-if(WITH_TESTING)
-  add_subdirectory(tests)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_paddle_exe(paddle_trainer TrainerMain.cpp)
-  add_paddle_exe(paddle_merge_model MergeModel.cpp)
-
-  install(TARGETS paddle_trainer paddle_merge_model
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-  set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
-
-if(APPLE)
-  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-endif()
-
-if(WITH_GOLANG)
-  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer paddle_pserver_cclient)
-endif(WITH_GOLANG)
diff --git a/paddle/legacy/trainer/MergeModel.cpp b/paddle/legacy/trainer/MergeModel.cpp
deleted file mode 100644
index 8a3601f1922..00000000000
--- a/paddle/legacy/trainer/MergeModel.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-#include "paddle/legacy/pserver/ParameterServer2.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-DEFINE_string(model_dir, "", "Directory for separated model files");
-DEFINE_string(config_file, "", "Config file for the model");
-DEFINE_string(model_file, "", "File for merged model file");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
-      FLAGS_model_file.empty()) {
-    LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
-                 "--config_file=config.py --model_file=out.paddle";
-    return 0;
-  }
-
-  string confFile = FLAGS_config_file;
-#ifndef PADDLE_WITH_CUDA
-  FLAGS_use_gpu = false;
-#endif
-  auto config = std::make_shared<TrainerConfigHelper>(confFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(FLAGS_model_dir);
-
-  ofstream os(FLAGS_model_file);
-
-  string buf;
-  config->getConfig().SerializeToString(&buf);
-  int64_t size = buf.size();
-  os.write((char*)&size, sizeof(size));
-  CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  os.write(buf.data(), buf.size());
-  vector<ParameterPtr>& parameters = gradientMachine->getParameters();
-  for (auto& para : parameters) {
-    para->save(os);
-    CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  }
-  os.close();
-
-  return 0;
-}
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
deleted file mode 100644
index cdd832acd16..00000000000
--- a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NewRemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config, const std::string pserverSpec)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec) {}
-
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config,
-    const std::string pserverSpec,
-    const bool useEtcd)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec),
-      useEtcd_(useEtcd) {}
-
-void NewRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr> &parameters) {
-  ParameterUpdater::init(parameters);
-
-  // create parameter server client.
-  if (useEtcd_) {
-    parameterClient_ =
-        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
-  } else {
-    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                                 FLAGS_trainer_id == 0);
-  }
-
-  // init new parameter and gradient.
-  newParameters_ = initNewParameter(PARAMETER_VALUE);
-  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
-
-  // init parameter, one trainer will get the opportunity to int parameter and
-  // send them to parameter server. Others will get the initialized parameter
-  // from parameter server
-  if (paddle_begin_init_params(parameterClient_)) {
-    LOG(INFO) << "paddle_begin_init_params start";
-    // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig.
-    // This makes golang pserver compatible with handy V1 demos.
-    // TODO(wuyi): Refine or remove these ugly converting lines
-    OptimizerConfig optimizerConfigV2;
-    if (trainerConfig_.learning_method() == "momentum") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    } else if (trainerConfig_.learning_method() == "adagrad") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adagrad()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-    } else if (trainerConfig_.learning_method() == "adadelta") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adadelta()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
-    } else if (trainerConfig_.learning_method() == "adam") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
-      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
-      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
-      optimizerConfigV2.mutable_adam()->set_epsilon(
-          trainerConfig_.adam_epsilon());
-    } else {
-      LOG(ERROR) << "got unsupported v1 optimizer config: "
-                 << trainerConfig_.learning_method();
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    }
-
-    if (trainerConfig_.learning_rate_schedule() == "constant") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
-      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
-          trainerConfig_.learning_rate_decay_a());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
-          trainerConfig_.learning_rate_decay_b());
-    } else {
-      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
-                 << trainerConfig_.learning_rate_schedule() << ", set to const";
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    }
-
-    // overwrite optimizerConfigV2 for per-parameter(layer) configs
-    for (int i = 0; i < parameterSize(); ++i) {
-      // FIXME(typhoonzero): paramConfig always have default values,
-      // how to check if it's default?
-      // TODO(typhoonzero): log output: optimizerConfigV2.DebugString();
-      LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString();
-      // send param and config to pserver
-      std::string bytes = optimizerConfigV2.SerializeAsString();
-      const char *array = bytes.data();
-      int size = (int)bytes.size();
-      paddle_init_param(
-          parameterClient_, *newParameters_[i], (void *)array, size);
-    }
-    paddle_finish_init_params(parameterClient_);
-    LOG(INFO) << "paddle_begin_init_params done";
-  } else {
-    paddle_get_params(parameterClient_, newParameters_, parameterSize());
-  }
-
-  LOG(INFO) << "NewRemoteParameterUpdater initialized";
-}
-
-void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
-
-void NewRemoteParameterUpdater::finishBatch(real cost) {
-  // send gradient to parameter server.
-  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
-  // get the updated parameter from parameterClient.
-  paddle_get_params(parameterClient_, newParameters_, parameterSize());
-
-  // clear gradient after update parameter.
-  for (auto &para : parameters_) {
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-}
-
-void NewRemoteParameterUpdater::startPass() {}
-
-bool NewRemoteParameterUpdater::finishPass() { return true; }
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.h b/paddle/legacy/trainer/NewRemoteParameterUpdater.h
deleted file mode 100644
index 707e9ceb9b6..00000000000
--- a/paddle/legacy/trainer/NewRemoteParameterUpdater.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "OptimizerConfig.pb.h"
-#include "ParameterUpdater.h"
-#include "libpaddle_pserver_cclient.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * New remote parameter updater for dense parameters that use cclient of go.
- */
-class NewRemoteParameterUpdater : public ParameterUpdater {
- public:
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec);
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec,
-                            const bool useEtcd);
-  ~NewRemoteParameterUpdater() {
-    releaseNewParameter(newParameters_);
-    releaseNewParameter(newGradients_);
-    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
- protected:
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
- private:
-  int parameterSize() { return (int)parameters_.size(); }
-
-  /**
-   * init parameter of go paddle pserver cclient.
-   * @param new_params
-   * @param type
-   */
-  paddle_parameter** initNewParameter(ParameterType type) {
-    paddle_parameter** new_params =
-        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
-    for (int i = 0; i < parameterSize(); ++i) {
-      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
-      memset(new_params[i], 0, sizeof(paddle_parameter));
-    }
-
-    for (int i = 0; i < parameterSize(); ++i) {
-      ParameterPtr param = parameters_[i];
-      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-      new_params[i]->name = (char*)param->getName().c_str();
-      new_params[i]->content =
-          (unsigned char*)(param->getBuf(type).get()->getData());
-      new_params[i]->content_len =
-          (int)param->getBuf(type).get()->getSize() * sizeof(real);
-    }
-    return new_params;
-  }
-
-  void releaseNewParameter(paddle_parameter** newParams) {
-    if (newParams != nullptr) {
-      for (int i = 0; i < parameterSize(); ++i) {
-        free(newParams[i]);
-      }
-      free(newParams);
-    }
-  }
-
- protected:
-  const OptimizationConfig& trainerConfig_;
-  /// internal parameter client object for exchanging data with pserver
-  paddle_pserver_client parameterClient_;
-  /// the parameters for new pserver client
-  paddle_parameter** newParameters_;
-  /// the gradinets for new pserver client
-  paddle_parameter** newGradients_;
-  /// the specification of parameter server "host1:port,host1:port"
-  std::string pserverSpec_;
-  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
-  bool useEtcd_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.cpp b/paddle/legacy/trainer/ParamUtil.cpp
deleted file mode 100644
index b5aba32dee1..00000000000
--- a/paddle/legacy/trainer/ParamUtil.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParamUtil.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-#include <paddle/legacy/utils/Version.h>
-
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-ParameterUtil::ParameterUtil(
-    const std::shared_ptr<TrainerConfigHelper> &config,
-    std::unique_ptr<ParameterUtilConfig> &&intconfig,
-    const GradientMachinePtr &gradientMachine,
-    const std::shared_ptr<ParameterUpdater> &parameterUpdater) {
-  config_ = config;
-  intConfig_ = std::move(intconfig);
-  gserver_ = gradientMachine;
-  pUpdater_ = parameterUpdater;
-}
-
-bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  std::string doneFile = path::join(config_->getSaveDir(), buf, "done");
-  if (!fileExist(doneFile.c_str())) return false;
-  loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote);
-  return true;
-}
-
-void ParameterUtil::loadParametersWithPath(const std::string &dir,
-                                           bool local,
-                                           bool remote) {
-  if (local) {
-    gserver_->loadParameters(dir);
-  }
-  if (remote && pUpdater_) {
-    pUpdater_->loadParametersRemote(dir);
-  }
-}
-
-void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) {
-  pUpdater_->apply();
-  saveParameters(passId, passInnerId);
-  if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) {
-    deleteParameters(passId - intConfig_->saving_period_);
-  }
-  pUpdater_->restore();
-}
-
-void ParameterUtil::saveParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  if (passInnerId > 0) {
-    snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "pass-%05d", passId);
-  }
-
-  std::string basePath = config_->getSaveDir();
-  if (basePath.find('/') == std::string::npos) {
-    basePath = "./" + basePath;
-  }
-  mkDirRecursively(basePath.c_str());
-
-  std::string saveDir = path::join(basePath, buf);
-  mkDir(saveDir.c_str());
-  if (!intConfig_->load_save_param_pserver_) {
-    pUpdater_->getParametersRemote(true /*full parameter*/,
-                                   true /*after apply*/);
-  }
-
-  gserver_->saveParameters(saveDir);
-  if (intConfig_->load_save_param_pserver_) {
-    pUpdater_->saveParametersRemote(saveDir);
-  }
-  std::string doneFile = path::join(saveDir, "done");
-  touchFile(doneFile.c_str());
-  std::ofstream out(doneFile);
-  version::printVersion(out);
-  out.close();
-  VLOG(1) << "save dir " << saveDir;
-  saveConfigWithPath(saveDir);
-}
-
-void ParameterUtil::deleteParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  const std::string &saveDir = config_->getSaveDir();
-  if (passInnerId > 0) {
-    snprintf(buf,
-             kBufLen,
-             "%s/pass-%05d-%03d",
-             saveDir.c_str(),
-             passId,
-             passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId);
-  }
-  mkDir(saveDir.c_str());
-  LOG(INFO) << "delete dir " << buf;
-  rmDir(buf);
-}
-
-void ParameterUtil::saveConfigWithPath(const std::string &path) {
-  std::string src;
-  // save config in some path
-  if (!intConfig_->config_.empty()) {
-    src = intConfig_->config_;
-  } else {
-    bool ok;
-    src = config_->getConfigName(&ok);
-    if (!ok) {
-      return;
-    }
-  }
-  copyFileToPath(src, path);
-
-  // save other import config file name to path.txt
-  std::string ss = path::join(path, "path.txt");
-  std::ofstream os(ss);
-  std::string fileName = path::basename(src);
-  CHECK(os.write(fileName.c_str(), fileName.length()))
-      << "Fail to write config file name " << ss;
-  VLOG(1) << "fileName " << fileName;
-  os.close();
-
-  // copy other import config files
-  for (int i = 0; i < config_->getConfig().config_files_size(); ++i) {
-    copyFileToPath(config_->getConfig().config_files(i), path);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.h b/paddle/legacy/trainer/ParamUtil.h
deleted file mode 100644
index 07786967762..00000000000
--- a/paddle/legacy/trainer/ParamUtil.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-
-namespace paddle {
-
-/**
- * Configuration for parameter utils.
- */
-struct ParameterUtilConfig {
-  DISABLE_COPY(ParameterUtilConfig);
-
-  ParameterUtilConfig(bool save_only_one,
-                      int saving_period,
-                      bool load_save_parameters_in_pserver,
-                      std::string config)
-      : save_only_one_(save_only_one),
-        saving_period_(saving_period),
-        load_save_param_pserver_(load_save_parameters_in_pserver),
-        config_(config) {}
-
-  bool save_only_one_;
-  int saving_period_;
-  bool load_save_param_pserver_;
-  std::string config_;
-};
-
-/**
- * ParameterUtil
- * Utility class for loading and saving parameters
- */
-class ParameterUtil {
- public:
-  /**
-   * Ctor.
-   *
-   * @param config
-   * @param intconfig
-   * @param gradientMachine
-   * @param parameterUpdater
-   * @return
-   */
-  ParameterUtil(const std::shared_ptr<TrainerConfigHelper> &config,
-                std::unique_ptr<ParameterUtilConfig> &&intconfig,
-                const GradientMachinePtr &gradientMachine,
-                const std::shared_ptr<ParameterUpdater> &parameterUpdater);
-
-  /// Load parameter from the saved parameter file as pass passId
-  /// if loadsave_parameters_in_pserver is set, some parameters MUST
-  /// load in pserver, which is "remote".
-  /// loadParameters can choose to load local/remote parameter, or both.
-  bool loadParameters(int passId, bool local = true, bool remote = false);
-
-  /// load parameters given path info
-  void loadParametersWithPath(const std::string &dir,
-                              bool local = true,
-                              bool remote = false);
-
-  /// Save parameter to dist for pass passId
-  /// passInnerId means saving times in one pass, some users want to
-  /// save parameters when have processed some batches in one pass
-  /// passInnerId = 0 means do not need to save in one inner pass
-  void saveParameters(int passId, int passInnerId = 0);
-
-  /// save parameters for one pass, when passInnerId > 0 means saving
-  /// the passInnerId times in one pass
-  void saveParametersOnePass(int passId, int passInnerId = 0);
-
-  /// delete parameter from disk via passId
-  void deleteParameters(int passId, int passInnerId = 0);
-
-  /// save config given path info
-  void saveConfigWithPath(const std::string &path);
-
-  /**
-   * Try to load parameter from config.
-   * @return true if can load from trainer config.
-   */
-  inline bool tryLoadParametersFromConfig() {
-    auto &c = config_->getConfig();
-    if (!c.init_model_path().empty()) {
-      loadParametersWithPath(c.init_model_path());
-      return true;
-    } else if (c.start_pass() > 0) {
-      CHECK(loadParameters(c.start_pass() - 1));
-      return true;
-    } else {
-      return false;
-    }
-  }
-
- private:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<ParameterUtilConfig> intConfig_;
-  GradientMachinePtr gserver_;
-  std::shared_ptr<ParameterUpdater> pUpdater_;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.cpp b/paddle/legacy/trainer/ParameterUpdater.cpp
deleted file mode 100644
index 549fb0332da..00000000000
--- a/paddle/legacy/trainer/ParameterUpdater.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdater.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Thread.h"
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-
-SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
-    const OptimizationConfig& optConfig)
-    : SgdLocalUpdater(optConfig, false /*with averager*/) {
-  CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu());
-  averager_.reset(AverageOptimizer::create(optConfig,
-                                           new DummyOptimizer(optConfig),
-                                           false /*sparse*/,
-                                           true /*apply*/));
-  updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
-}
-
-void SgdUpdaterWithCpuAverager::init(
-    const std::vector<ParameterPtr>& parameters) {
-  SgdLocalUpdater::init(parameters);
-  averager_->init(parameters_.size(), nullptr);
-  copyEvents_.resize(parameters_.size());
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                              /* useGpu= */ false,
-                                              /* doInit= */ false));
-    if (parameter->useGpu()) {
-      cpuParameters_.back()->enableType(PARAMETER_APPLY);
-    } else {
-      cpuParameters_.back()->enableSharedType(
-          PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE));
-    }
-    for (ParameterType type : averager_->getParameterTypes()) {
-      cpuParameters_.back()->enableType(type);
-    }
-
-    hl_create_event(&copyEvents_[nonStaticParaIDMap_[parameter->getID()]]);
-  }
-}
-
-SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() {
-  for (auto& event : copyEvents_) {
-    hl_destroy_event(event);
-  }
-}
-
-void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) {
-  SgdLocalUpdater::updateImpl(para);
-
-  if (para->useGpu()) {
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    cpuPara->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream);
-    hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]);
-  }
-
-  updateWorker_.addJob(
-      std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para));
-}
-
-void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) {
-  SetDevice setDevice(para->getDeviceId());
-  size_t pid = nonStaticParaIDMap_[para->getID()];
-  Parameter* cpuPara = cpuParameters_[pid].get();
-  if (para->useGpu()) {
-    hl_event_synchronize(copyEvents_[pid]);
-  }
-  averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU);
-}
-
-void SgdUpdaterWithCpuAverager::finishBatch(real cost) {
-  SgdLocalUpdater::finishBatch(cost);
-
-  updateWorker_.wait();
-  for (auto para : cpuParameters_) {
-    if (auto callback = averager_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-  averager_->finishBatch();
-}
-
-void SgdUpdaterWithCpuAverager::apply() {
-  // backup gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    para->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream);
-  }
-
-  // apply on cpu parameter
-  if (auto callback = averager_->apply()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // copy to gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    if (parameters_[pid]->useGpu()) {
-      para->getBuf(PARAMETER_VALUE)
-          ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream);
-    }
-  }
-  hl_stream_synchronize(kHostToDeviceStream);
-  for (auto& para : parameters_) {
-    para->setValueUpdated();
-  }
-}
-
-void SgdUpdaterWithCpuAverager::restore() {
-  // restore on cpu parameter
-  if (auto callback = averager_->restore()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // restore gpu value
-  for (auto& para : parameters_) {
-    SetDevice device(para->getDeviceId());
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-    para->setValueUpdated();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.h b/paddle/legacy/trainer/ParameterUpdater.h
deleted file mode 100644
index acddc3702d7..00000000000
--- a/paddle/legacy/trainer/ParameterUpdater.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * @brief Parameter Updater for SGD, and local(not cluster) run.
- */
-class SgdLocalUpdater : public ParameterUpdater {
- public:
-  /**
-   * @brief Ctor. Initialize optimizer locally by optConfig.
-   * @param optConfig optimization config.
-   * @param withAverager with average optimizer or not, default is true.
-   */
-  explicit SgdLocalUpdater(const OptimizationConfig& optConfig,
-                           bool withAverager = true)
-      : numSamplesProcessed_(0) {
-    auto baseOptimizer = ParameterOptimizer::create(optConfig);
-    optimizer_.reset(withAverager
-                         ? AverageOptimizer::create(optConfig, baseOptimizer)
-                         : baseOptimizer);
-    CHECK(optimizer_) << "fail to create optimizer: "
-                      << optConfig.learning_method();
-    auto types = optimizer_->getParameterTypes();
-    for (auto type : types) {
-      addParameterType(type);
-    }
-  }
-
-  /**
-   * @brief Initialize parameters and optimizer_.
-   *        For example,
-   *           If optimizer need hassien vector, then parameter's hassien will
-   *           be initialized.
-   * @param parameters The parameter need to be initialized.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters) {
-    ParameterUpdater::init(parameters);
-    optimizer_->init(parameters_.size(), nullptr);
-    // check no L1 decay in parameter configs
-    CHECK(std::find_if(parameters.begin(),
-                       parameters.end(),
-                       [](const ParameterPtr& para) {
-                         return para->getConfig().decay_rate_l1() > 0.0f;
-                       }) == parameters.end())
-        << "SgdLocalUpdater cannot support L1 decay in parameter";
-  }
-
-  /**
-   * @brief Start a batch with current mini-batch size
-   * @param current mini-batch size.
-   * @return Always PASS_TRAIN.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    numSamplesProcessed_ += batchSize;
-    optimizer_->startBatch(numSamplesProcessed_);
-    return PASS_TRAIN;
-  }
-
-  /**
-   * @brief finish a mini-batch.
-   */
-  virtual void finishBatch(real cost) { optimizer_->finishBatch(); }
-
-  /**
-   * @brief start a pass.
-   */
-  virtual void startPass() { optimizer_->startPass(); }
-
-  /**
-   * @brief finish a pass.
-   * @param cost sum cost during one pass.
-   * @return true if accept (used for owlqn).
-   */
-  virtual bool finishPass() {
-    optimizer_->finishPass();
-    return ParameterUpdater::finishPass();
-  }
-
-  /**
-   * @brief apply model average.
-   */
-  virtual void apply() {
-    if (auto callback = optimizer_->apply()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
-  /**
-   * @brief restore parameter value before model average
-   */
-  virtual void restore() {
-    if (auto callback = optimizer_->restore()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
- protected:
-  /**
-   * @brief update method. Update value from gradient.
-   * @param para parameter that will be updated.
-   */
-  virtual void updateImpl(Parameter* para) {
-    optimizer_->update(para->getBufs(), para->getConfig());
-    if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1UL);
-    }
-
-    para->setValueUpdated();
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-
-  /**
-   * @brief total number of samples processed.
-   */
-  int64_t numSamplesProcessed_;
-};
-
-/**
- * @brief SgdCpuUpdater is used only in recursive neural network
- * @deprecated
- */
-class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
- public:
-  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
-      : SgdLocalUpdater(optConfig),
-        Deprecated(
-            "SgdCpuUpdater is used only in recursive neural network, "
-            "and recursive neural network is deprecated in paddle. "
-            "Use it all by your own.") {}
-
-  /**
-   * @brief update all parameter on finish batch.
-   * @param cost
-   */
-  virtual void finishBatch(real cost) {
-    for (auto para : parameters_) {
-      SgdLocalUpdater::update(para.get());
-    }
-    optimizer_->finishBatch();
-  }
-
- protected:
-  /**
-   * @brief do nothing.
-   * @param para
-   */
-  virtual void updateImpl(Parameter* para) {}
-};
-
-/**
- * @brief Sgd Local Updater With average in cpu.
- *
- * It will do model average in cpu to reduce gpu memory comsuption.
- */
-class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
- public:
-  /**
-   * @brief Ctor.
-   *
-   * SgdUpdaterWithCpuAverager will do everything as a
-   * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model
-   * average in cpu.
-   */
-  explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig);
-  ~SgdUpdaterWithCpuAverager();
-
-  /**
-   * @brief init. Initialize cpu parameters, model average optimizer.
-   * @param parameters
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  virtual PassType startBatch(int64_t batchSize) {
-    averager_->startBatch(-1UL);
-    return SgdLocalUpdater::startBatch(batchSize);
-  }
-  virtual void finishBatch(real cost);
-
-  virtual void startPass() {
-    averager_->startPass();
-    SgdLocalUpdater::startPass();
-  }
-  virtual bool finishPass() {
-    averager_->finishPass();
-    return SgdLocalUpdater::finishPass();
-  }
-
-  /// apply the averaged parameter to PARAMETER_VALUE
-  /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE
-  virtual void apply();
-
-  /**
-   * @brief Restore parameter before apply().
-   */
-  virtual void restore();
-
- protected:
-  virtual void updateImpl(Parameter* para);
-
-  void updateFunc(Parameter* para);
-
- protected:
-  std::unique_ptr<ParameterOptimizer> averager_;
-
-  /**
-   * @brief The thread worker which do model average.
-   *
-   * For each parameter, GPU->CPU parameter is async, and do model average in
-   * another thread. Because the training process don't need model average while
-   * training, and model average only used in evaluation stage and saving stage.
-   * So the model average is totally async.
-   */
-  ThreadWorker updateWorker_;
-
-  /**
-   * @brief The parameter mirror in cpu.
-   */
-  std::vector<ParameterPtr> cpuParameters_;
-
-  /**
-   * @brief GPU -> CPU copy event. Model average will wait after copy done.
-   */
-  std::vector<hl_event_t> copyEvents_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.cpp b/paddle/legacy/trainer/RemoteParameterUpdater.cpp
deleted file mode 100644
index 5de1cc7827a..00000000000
--- a/paddle/legacy/trainer/RemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-static const int kFinishBatchPid = -1;
-
-const std::string RemoteParameterUpdater::kAverage = "average";
-const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average";
-
-RemoteParameterUpdater::RemoteParameterUpdater(
-    const OptimizationConfig& config,
-    int expectedPassCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : config_(config),
-      localUpdater_(std::move(localUpdater)),
-      numBatches_(0),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      separateSendAndRecv_(false),
-      isFirstPass_(true),
-      useApplyInPserver_(false) {
-  addParameterType(PARAMETER_MOMENTUM);
-}
-
-void RemoteParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  if (localUpdater_) {
-    localUpdater_->init(parameters);
-
-    for (auto& parameter : parameters) {
-      parameter->enableType(PARAMETER_DELTA);
-    }
-
-    CHECK(config_.center_parameter_update_method() == kAverage ||
-          config_.center_parameter_update_method() == kElasticAverage)
-        << "unknown center_parameter_update_method";
-
-    // modify delta_add_rate
-    CHECK_GT(FLAGS_num_gradient_servers, 1)
-        << "FLAGS_num_gradient_servers should be set in trainer args.";
-    real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers;
-    config_.set_delta_add_rate(delta_add_rate);
-    LOG(INFO) << "center parameter in pserver,"
-              << " modify delta_add_rate=" << delta_add_rate;
-  }
-
-  if (!FLAGS_use_gpu) {
-    cpuParameters_ = parameters;
-  } else {
-    for (auto& parameter : parameters) {
-      cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                                /* useGpu= */ false));
-      cpuParameters_.back()->setID(parameter->getID());
-      if (localUpdater_) {
-        cpuParameters_.back()->enableType(PARAMETER_DELTA);
-      }
-    }
-  }
-
-  parameterClient_.reset(new ParameterClient2(separateSendAndRecv_));
-  parameterClient_->init(cpuParameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(config_);
-    copyParametersFromDevice(PARAMETER_VALUE);
-    parameterClient_->setParameter();
-    parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-  } else {
-    parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-  if (FLAGS_trainer_id == 0 &&
-      (config_.algorithm() != TrainAlgorithm::AsyncSGD)) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-void RemoteParameterUpdater::startController() {
-  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
-}
-
-void RemoteParameterUpdater::controller() {
-  ParameterClient2 client(false);
-  client.init(cpuParameters_);
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackarameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersToDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    parameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*cpuParameters_[i]->getBuf(parameterType));
-    if (parameterType == PARAMETER_VALUE) {
-      parameters_[i]->setValueUpdated();
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersFromDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    cpuParameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*parameters_[i]->getBuf(parameterType));
-  }
-}
-
-void RemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-  }
-}
-
-void RemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-  }
-
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-
-  ParameterType sendType;
-  bool sendBackParameter = true;
-  if (localUpdater_) {
-    ++numBatches_;
-    if (numBatches_ % config_.num_batches_per_send_parameter() != 0) {
-      return;
-    }
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      parameterClient_->getParameter(PARAMETER_DELTA);
-      copyParametersToDevice(PARAMETER_DELTA);
-      sendBackParameter = false;  // no need send back after send
-
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-
-        // when delta send to pserver, pserver will do:
-        // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE)
-      }
-    } else {
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-    }
-
-    sendType = PARAMETER_DELTA;
-
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-
-  copyParametersFromDevice(sendType);
-
-  {
-    REGISTER_TIMER("sendAndRecv_dense");
-    parameterClient_->sendAndReceiveParameter(mode,
-                                              sendType,
-                                              batchSize_,
-                                              0,  // cost = 0
-                                              sendBackParameter);
-  }
-
-  if (sendBackParameter) {
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE)
-        para->getBuf(PARAMETER_VALUE)
-            ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate());
-      }
-
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  } else {
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      para->getBuf(sendType)->zeroMem();
-    }
-  }
-}
-
-void RemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-  } else {
-    // sync could benifits reducing lagged trainer for async-sgd
-    // even if sync could not remove all lagged trainer for the
-    // sake of file loading, buffer etc.
-    parameterClient_->asyncStartPass();
-  }
-
-  if (localUpdater_) {
-    localUpdater_->startPass();
-    numBatches_ = 0;
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      if (!isFirstPass_) {
-        // restore local value from delta
-        for (auto& para : parameters_) {
-          SetDevice device(para->getDeviceId());
-          para->getBuf(PARAMETER_VALUE)
-              ->copyFrom(*para->getBuf(PARAMETER_DELTA));
-        }
-      }
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-}
-
-bool RemoteParameterUpdater::finishPass() {
-  if (localUpdater_) {
-    localUpdater_->finishPass();
-  }
-
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-  } else {
-    parameterClient_->asyncFinishPass();
-  }
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      // backup local value to delta as we will get
-      // the remote parameter for saving/testing
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-  parameterClient_->getParameter();
-  copyParametersToDevice(PARAMETER_VALUE);
-
-  isFirstPass_ = false;
-  return true;
-}
-
-void RemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackarameter= */ false);
-    parameterClient_->getParameter(
-        /* recvParameterType= */ PARAMETER_VALUE,
-        /* sendBackParameterType= */ PARAMETER_APPLY);
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-void RemoteParameterUpdater::restore() {
-  if (useApplyInPserver_) {
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater(
-    OptimizationConfig config,
-    int passCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) {
-  sendThread_.reset(new std::thread([this]() { this->send(); }));
-  recvThread_.reset(new std::thread([this]() { this->recv(); }));
-
-  stopping_ = false;
-  oneBatchFinished_ = false;
-  separateSendAndRecv_ = true;
-}
-
-ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() {
-  stopping_ = true;
-  sendQueue_.enqueue(0);
-  sendThread_->join();
-  recvQueue_.enqueue(0);
-  recvThread_->join();
-}
-
-void ConcurrentRemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-
-    if (!needToUpdateRemotely()) {
-      ++numBatches_;
-      return;
-    }
-  }
-
-  sendQueue_.enqueue(kFinishBatchPid);
-
-  finishBatchCond_.wait([this]() { return oneBatchFinished_; });
-  oneBatchFinished_ = false;
-  {
-    REGISTER_TIMER("sync_hostToDeviceStream");
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      hl_stream_synchronize(kHostToDeviceStream);
-    }
-  }
-
-  if (localUpdater_) {
-    ++numBatches_;
-  }
-}
-
-// Use para=NULL to signal the end of one batch
-void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-  ParameterType sendType;
-  if (localUpdater_) {
-    sendType = PARAMETER_DELTA;
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-  std::vector<ParameterSegments> paraSegment;
-  if (para == NULL) {
-    parameterClient_->sendParameter(
-        mode,
-        sendType,
-        paraSegment,
-        batchSize_,
-        0,              // cost=0
-        true,           // sendBackParameter = true
-        batchStatus_);  // batchStatus_ = BATCH_FINISH
-
-  } else {
-    ParameterSegments paraSegTemp;
-    paraSegment.reserve(1);
-    paraSegTemp.name = para->getName();
-    paraSegTemp.id = para->getID();
-    paraSegment.push_back(paraSegTemp);
-    {
-      SetDevice device(para->getDeviceId());
-      REGISTER_TIMER("copySingleParaFromDevice");
-      copySingleParaFromDevice(para, sendType);
-      hl_stream_synchronize(kDeviceToHostStream);
-    }
-    parameterClient_->sendParameter(mode,
-                                    sendType,
-                                    paraSegment,
-                                    batchSize_,
-                                    0,     // cost=0
-                                    true,  // sendBackParameter = true
-                                    batchStatus_);
-    if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON;
-  }
-}
-void ConcurrentRemoteParameterUpdater::recv(Parameter* para) {
-  parameterClient_->recvParameter();
-  if (para != NULL) {
-    REGISTER_TIMER("copySingleParaToDevice");
-    SetDevice device(para->getDeviceId());
-    copySingleParaToDevice(para, PARAMETER_VALUE);
-
-    if (localUpdater_) {
-      para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-    } else {
-      // if cpu, parameter should not changes until recvParameter().
-      // if gpu, zero mem when send finish
-      if (!FLAGS_use_gpu) {
-        para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-      }
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::recv() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("recv");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("recv_dequeue");
-      pid = recvQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; });
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      oneBatchFinished_ = false;
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::send() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("send");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("send_dequeue");
-      pid = sendQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      batchStatus_ = BATCH_FINISH;
-      if (!localUpdater_) {
-        // if cpu, parameter should not changes until recvParameter().
-        // if gpu, zeroMem() at the end of batch so that it won't
-        // interfere with computation.
-        if (FLAGS_use_gpu) {
-          REGISTER_TIMER("para_zeroMem");
-          for (auto& para : parameters_) {
-            SetDevice device(para->getDeviceId());
-            para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-          }
-        }
-      }
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      recvQueue_.enqueue(pid);
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      if (localUpdater_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-    if (!needToUpdateRemotely()) {
-      return;
-    }
-  }
-  sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-}
-
-void ConcurrentRemoteParameterUpdater::copySingleParaToDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int i = nonStaticParaIDMap_[para->getID()];
-  para->getBuf(parameterType)
-      ->copyFrom(*cpuParameters_[i]->getBuf(parameterType),
-                 kHostToDeviceStream);
-  if (parameterType == PARAMETER_VALUE) {
-    para->setValueUpdated();
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int i = nonStaticParaIDMap_[para->getID()];
-  cpuParameters_[i]
-      ->getBuf(parameterType)
-      ->copyFrom(*para->getBuf(parameterType), kDeviceToHostStream);
-}
-
-SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(
-    const OptimizationConfig& config, int expectedPassCount, bool testing)
-    : config_(config),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      testing_(testing),
-      useApplyInPserver_(false) {}
-
-void SparseRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  parameterClient_.reset(new ParameterClient2(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
-  parameterClient_->init(parameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(
-        config_, FLAGS_save_dir, true /*is_sparse_server*/);
-    if (parameters[0]->isFullSize()) {
-      parameterClient_->setParameter();
-    } else {  // init in pserver
-      parameterClient_->setParameterZero();
-    }
-  }
-  if (FLAGS_trainer_id == 0 && !testing_ &&
-      config_.algorithm() == TrainAlgorithm::SGD) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-void SparseRemoteParameterUpdater::startController() {
-  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
-}
-
-void SparseRemoteParameterUpdater::controller() {
-  ParameterClient2 client(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse);
-  client.init(parameters_);
-
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackarameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) {
-  batchSize_ = batchSize;
-  return PASS_TRAIN;
-}
-
-void SparseRemoteParameterUpdater::finishBatch(real cost) {
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-
-  ParameterType sendType = PARAMETER_GRADIENT;
-
-  REGISTER_TIMER("sendSparseParam");
-  parameterClient_->sendAndReceiveParameter(mode,
-                                            sendType,
-                                            batchSize_,
-                                            0,       // cost = 0
-                                            false);  // sendBackParameter
-
-  // grad zero move to sgd grad machine, before merge grad sparse remote
-}
-
-void SparseRemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-  } else {
-    if (FLAGS_trainer_id == 0) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      parameterClient_->doOperation(ops,
-                                    /* waitForGradient= */ false,
-                                    /* sendBackarameter= */ false);
-    }
-    parameterClient_->asyncStartPass();
-  }
-}
-
-bool SparseRemoteParameterUpdater::finishPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-  } else {
-    if (FLAGS_trainer_id == 0) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      parameterClient_->doOperation(ops,
-                                    /* waitForGradient= */ false,
-                                    /* sendBackarameter= */ false);
-    }
-    parameterClient_->asyncFinishPass();
-  }
-
-  return true;
-}
-
-// Trainer will call getParametersRemote at batch start or before save,
-// so we do not get values in apply() and restore().
-void SparseRemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackarameter= */ false);
-  }
-}
-
-void SparseRemoteParameterUpdater::restore() {}
-
-void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize,
-                                                       bool apply) {
-  ParameterType sendBackParameterType =
-      (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE;
-  std::function<void()> getParams;
-  std::function<void(Parameter&, real)> applyL1;
-  if (fullSize) {
-    getParams = [&] {
-      parameterClient_->getParameter(
-          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-    };
-    applyL1 = [](Parameter& para, real decayRate) {
-      para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    };
-  } else {
-    getParams = [&] {
-      parameterClient_->getParameterSparse(
-          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-    };
-    applyL1 = [](Parameter& para, real decayRate) {
-      para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    };
-  }
-  {
-    REGISTER_TIMER("getParamDenseAndSparse");
-    getParams();
-    if (config_.shrink_parameter_value() > 0) {
-      for (auto& para : parameters_) {
-        if (para->getConfig().decay_rate_l1() > 0) {
-          applyL1(*para, config_.shrink_parameter_value());
-        }
-      }
-    }
-  }
-}
-
-void SparseRemoteParameterUpdater::randParametersRemote() {
-  CHECK_EQ(FLAGS_trainer_id, 0);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RANDOMIZE);
-  parameterClient_->doOperation(ops,
-                                /* waitForGradient= */ false,
-                                /* sendBackarameter= */ false);
-}
-
-void SparseRemoteParameterUpdater::loadParametersRemote(
-    const std::string& dirName) {
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->loadValueVector(dirName);
-  }
-
-  if (testing_) {
-    // we do not use synchronize() here,
-    // because test mode may run only one tester
-    if (FLAGS_trainer_id == 0) {
-      parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-    } else {
-      parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-    }
-  }
-}
-
-void SparseRemoteParameterUpdater::saveParametersRemote(
-    const std::string& dirName) {
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->saveValueVector(dirName);
-  }
-}
-
-void SparseRemoteParameterUpdaterComposite::init(
-    const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-
-  std::vector<ParameterPtr> parametersArray[NUMBER_UPDATERS];
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      parametersArray[UPDATER_SPARSE_REMOTE].push_back(para);
-    } else {
-      parametersArray[UPDATER_NORMAL].push_back(para);
-    }
-  }
-  CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty());
-  CHECK(!parametersArray[UPDATER_NORMAL].empty());
-
-  syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-    updaters_[tid]->init(parametersArray[tid]);
-  });
-
-  parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes();
-}
-
-std::vector<std::function<ParameterUpdater*(
-    const std::string&, const OptimizationConfig&, bool, size_t)>>
-    ParameterUpdaterCreators::constructors_;
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.h b/paddle/legacy/trainer/RemoteParameterUpdater.h
deleted file mode 100644
index 68468532981..00000000000
--- a/paddle/legacy/trainer/RemoteParameterUpdater.h
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "ParameterUpdater.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-// TODO(yanfei):
-// I think that the biggest feature of rdma is packet lossless control
-// feature instead of high bandwiths, zero copy and gpu-direct rdma in
-// theroy.
-// But zero-copy and gpu-direct rdma features can help to reduce latency
-// caused by os system.
-// So, for some specified cluster, such as high density gpu cluster,
-// gpu-direct and zero copy could help to improve cluster communication
-// performance.
-//
-
-/**
- * Normal remote parameter updater for dense parameters.
- *
- * It first packs all parameters for all pservers using ParameterClient
- * module, then wait for merged parameters data from all pservers.
- * The synchronization pattern specified by sync-sgd or async-sgd is
- * achieved by all pservers with the help of the controller within this
- * remote parameter updater.
- * This module indeedly bridges the gradient machines and parameter servers.
- * It helps to transfer the parameters from acceleration device to cpu end
- * for network. It contains additional parameters copy buffers for
- * acceleration devices at cpu end, such as gpu, otherwise it will
- * directly use original parameters data to update pservers.
- *
- * This remote parameter updater does not use pipeline mechanism to hide
- * copy latency from gpu to cpu buffer. In addition the overlapped between
- * backward and communication is not supported.
- */
-class RemoteParameterUpdater : public ParameterUpdater {
- public:
-  RemoteParameterUpdater(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
-  ~RemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    if (localUpdater_) {
-      localUpdater_->startBatch(batchSize);
-    }
-    batchSize_ = batchSize;
-    batchStatus_ = BATCH_START;
-    return PASS_TRAIN;
-  }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary. it will implictly
-   * cooperate with controller thread for sync-sgd.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
-  virtual void apply();
-  virtual void restore();
-
- protected:
-  /**
-   * control all pservers with all trainers for sync-sgd
-   */
-  virtual void controller();
-
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
-  void startController();
-
-  /**
-   * @brief copy parameters from cpu host to device, such as gpu.
-   *
-   * @note  return if all data are transfered.
-   */
-  void copyParametersToDevice(ParameterType parameterType);
-
-  /**
-   * @brief copy parameters from device to cpu host
-   *
-   * @note  return if all data are transfered
-   */
-  void copyParametersFromDevice(ParameterType parameterType);
-
- protected:
-  /// Optimization config used to guide initialization and finishBatch
-  OptimizationConfig config_;
-  /// internal parameter client object for exchanging data with pserver
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  /// internal shadow buffer at cpu host end, use original parameters_
-  /// if no acceleration devices are used.
-  std::vector<ParameterPtr> cpuParameters_;
-  /// local updater for aggregating multi-batches local delta
-  std::unique_ptr<ParameterUpdater> localUpdater_;
-  /// the size of mini-batch
-  int64_t batchSize_;
-  /// batches passed
-  int64_t numBatches_;
-  /// for stateful control
-  BatchStatus batchStatus_;
-  /// controller thread for sync-sgd
-  std::unique_ptr<std::thread> controllerThread_;
-  /// passed already finished
-  int64_t passCount_;
-  /// expected passes to finished
-  int64_t expectedPassCount_;
-  /// use normal synchronization communication if True
-  bool separateSendAndRecv_;
-  /// true if it's first pass
-  bool isFirstPass_;
-  bool useApplyInPserver_;
-
-  static const std::string kAverage;
-  static const std::string kElasticAverage;
-};
-
-// TODO(yanfei):
-// do parameters level synchronization Optimization at pserver end with
-// ConcurrentRemoteParameterUpdater to get more parallelization, at last
-// to really hide pserver latency in backward computation.
-//
-/**
- * This updater add additional optimization for overlapping synchronization
- * from pservers with backward computation.
- *
- * Parameter can be sent to pservers when related backward stage is finished.
- * This concurrent udpater does data copy from acceleration device to host
- * memory aynchronously. In addition internal parameter client reads data in
- * host memory and send them to all pservers in next stage. So this class
- * help to pipeline device-to-host copy and host-to-network to hide network
- * latency in backward stage.
- * It contains separate send and recv thread for pipeline usage.
- */
-class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
- public:
-  ConcurrentRemoteParameterUpdater(
-      OptimizationConfig config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater);
-  ~ConcurrentRemoteParameterUpdater();
-
-  /**
-   * @brief send paraemeters to all pservers
-   *
-   * @note  it just signal the end signal to internal parameter client
-   *        to finished the aynchronous send action. In addition it also
-   *        do synchronization for all asynchronous host-to-device copy.
-   */
-  virtual void finishBatch(real cost);
-
- protected:
-  virtual void updateImpl(Parameter* para);
-  /// internal thread called in send thread
-  void send(Parameter* para);  // para == NULL indicate end of a minibatch
-  /// internal function called in recv thread
-  void recv(Parameter* para);
-  /**
-   * @brief send thread for relaying data from gradient to parameter client
-   *
-   * @note  just pipe data to internal parameter client for pipeline
-   */
-  void send();
-  /**
-   * @brief recv thread for relaying data from internal parameter client to
-   *        host memory
-   *
-   * @note  it contains the asynchronous data copy form host to device
-   */
-  void recv();
-  /// copy specified parameter from host to device
-  void copySingleParaToDevice(Parameter* para, ParameterType parameterType);
-  /// copy specified parameter from device to host
-  void copySingleParaFromDevice(Parameter* para, ParameterType parameterType);
-  bool needToUpdateRemotely() {
-    return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
-  }
-
- private:
-  /// send thread used for overlapping
-  std::unique_ptr<std::thread> sendThread_;
-  /// recv thread used for overlapping
-  std::unique_ptr<std::thread> recvThread_;
-  /// buffer queue for overlapping
-  Queue<int> sendQueue_;
-  /// buffer queue for overlapping
-  Queue<int> recvQueue_;
-  /// flags indicating to stop
-  bool stopping_;
-  /// conditional variable for threads synchronization between the
-  /// thread calling finishBatch and internal recv thread
-  LockedCondition finishBatchCond_;
-  bool oneBatchFinished_;
-};
-
-// TODO(yanfei):
-// merge sparse updater with dense updater, and could help to reduce
-// the synchronization between sparse and dense udpater. it could also
-// reduce the threads for managing all connections.
-/**
- * This class is specified for updating sparse parameters.
- *
- * It allows part of parameter to be exchanged with all pservers.
- * If sparse input assigned, part gradients of first hidden layer
- * could remained zero which can not need to be exchanged within
- * all pservers. This is the key optimization point for this updater
- *
- * For updating sparse parameters, all latest parameters are stored
- * in pservers instead of keeping full copy at train end, so need to
- * prefetch parameters weight value which can be changed in next-batch
- * before doing next forwardbackward. Also, with above fact that the
- * parameters can be stored in pserver instead of trainer, we can
- * fetch specified parmeters if necessary, and can support huge
- * parameters which is larger enough than  the RAM size in single
- * node.
- *
- * Internally, this updater will direct internal parameter client
- * to encapsulate sparse specified message for all pservers.
- */
-class SparseRemoteParameterUpdater : public ParameterUpdater {
- public:
-  SparseRemoteParameterUpdater(const OptimizationConfig& config,
-                               int expectedPassCount,
-                               bool testing);
-  ~SparseRemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /// initialization
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  /// stateful batch control
-  virtual PassType startBatch(int64_t batchSize);
-  /// send all sparse related parameters to all pservers
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-  virtual void apply();
-  virtual void restore();
-
-  /// load parameters from pservers
-  virtual void loadParametersRemote(const std::string& dirName);
-  /// save parameters to pservers
-  virtual void saveParametersRemote(const std::string& dirName);
-  /**
-   * @brief get latest sparse parameters value from all pservers
-   *
-   * @note  call it before next mini-batch
-   */
-  virtual void getParametersRemote(bool fullSize, bool apply);
-  virtual void randParametersRemote();
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
- protected:
-  /// update implimentation, not implemented
-  virtual void updateImpl(Parameter* para) {}
-
-  /// internal controller routine for controller thread
-  virtual void controller();
-
-  /// start controller thread
-  void startController();
-
- protected:
-  /// optimization config
-  OptimizationConfig config_;
-  /// internal parameter client
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  int64_t batchSize_;
-  std::unique_ptr<std::thread> controllerThread_;
-  int64_t passCount_;
-  int64_t expectedPassCount_;
-  bool testing_;
-  bool useApplyInPserver_;
-};
-
-/**
- * Class for supporting normal updater and sparse updater
- *
- * Not all parts of one model are sparse, so it exists dense updater
- * for normal layers while sparse updater is for sparse layers.
- *
- * it directly call internal dense and sparse udpater individually.
- */
-class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
- public:
-  enum {
-    UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
-    UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
-    NUMBER_UPDATERS = 2,
-  };
-  /**
-   * @brief create one dense updater and one sparse updater
-   *
-   * @note  use syncThreadPool to synchronize these two updaters
-   */
-  SparseRemoteParameterUpdaterComposite(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      bool testing,
-      std::unique_ptr<ParameterUpdater>&& normalUpdater) {
-    updaters_.resize(NUMBER_UPDATERS);
-    updaters_[UPDATER_SPARSE_REMOTE].reset(
-        new SparseRemoteParameterUpdater(config, expectedPassCount, testing));
-    updaters_[UPDATER_NORMAL] = std::move(normalUpdater);
-
-    syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1));
-  }
-
-  /// initialization of dense and sparse updaters
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-};
-
-class ParameterUpdaterCreators {
- public:
-  /**
-   * @brief add a creator to create custom ParameterUpdater while training.
-   *        The creator is a function with type (alogrithm, optConfig, isLocal,
-   *        numPasses) -> ParameterUpdater*. Trainer will use this
-   *        ParameterUpdater if creator can create a no nullptr
-   *        ParameterUpdater. Return nullptr will use trainer's default
-   *        updaters.
-   *
-   * @param creator method which can create ParameterUpdater.
-   */
-  static void addCreator(
-      const std::function<ParameterUpdater*(
-          const std::string&,         // algo
-          const OptimizationConfig&,  // optConfig
-          bool,                       // isLocal
-          size_t                      // numPasses
-          )>& creator) {  // NOLINT  explicit move closing ) in this line
-                          // for readability
-    constructors_.push_back(creator);
-  }
-
-  /**
-   * @brief Try to create an updater by given algo, optConfig, isLocal,
-   *        numPasses. Return nullptr if cannot create anyone.
-   * @param algo algorithm string.
-   * @param optConfig optimization config.
-   * @param isLocal is in local mode or not.
-   * @param numPasses total passes that trainer will train.
-   * @return nullptr if fail, not nullptr if we can create an updater.
-   */
-  static ParameterUpdater* tryCreateUpdater(const std::string& algo,
-                                            const OptimizationConfig& optConfig,
-                                            bool isLocal,
-                                            size_t numPasses) {
-    for (auto& c : constructors_) {
-      if (auto updater = c(algo, optConfig, isLocal, numPasses)) {
-        return updater;
-      }
-    }
-    return nullptr;
-  }
-
- private:
-  static std::vector<std::function<ParameterUpdater*(
-      const std::string&, const OptimizationConfig&, bool, size_t)>>
-      constructors_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.cpp b/paddle/legacy/trainer/Tester.cpp
deleted file mode 100644
index d977ca9657a..00000000000
--- a/paddle/legacy/trainer/Tester.cpp
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Tester.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-               std::unique_ptr<TesterConfig>&& intconfig,
-               const GradientMachinePtr& gradientMachine,
-               const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-               std::shared_ptr<DataProvider> testDataProvider)
-    : config_(config),
-      intconfig_(std::move(intconfig)),
-      gradientMachine_(gradientMachine),
-      parameterUpdater_(parameterUpdater),
-      testDataProvider_(testDataProvider) {
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
-               << "when doing train and test jobs in the same "
-               << "process. You could run paddle --job=test in "
-               << "a separate process.";
-  }
-  testEvaluator_.reset(gradientMachine_->makeEvaluator());
-  if (intconfig_->distributeTest) {
-    testParameterClient_.reset(new ParameterClient2(true));
-  }
-
-  if (testParameterClient_) {
-    testParameterClient_->init(gradientMachine_->getParameters());
-  }
-
-  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(intconfig_->saveOnlyOne,
-                              intconfig_->savingPeriod,
-                              intconfig_->loadsaveParametersInPserver,
-                              intconfig_->config));
-
-  paramUtil_.reset(new ParameterUtil(
-      config_, std::move(paramConfig), gradientMachine_, parameterUpdater_));
-}
-
-void Tester::startTestPeriod() {
-  if (testDataProvider_) {
-    testDataProvider_->reset();
-  }
-  testEvaluator_->start();
-  testContext_.cost = 0;
-  testContext_.numSamples = 0;
-
-  parameterUpdater_->apply();
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->getState(*intconfig_->trainState);
-    gradientMachine_->setState(*intconfig_->testState);
-  }
-}
-
-void Tester::testOneDataBatch(const DataBatch& dataBatch,
-                              std::vector<Argument>* outArgs) {
-  testContext_.cost +=
-      forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs);
-  testContext_.numSamples += dataBatch.getSize();
-}
-
-void Tester::testOnePeriod() {
-  DataBatch dataBatch;
-  int64_t batchSize = config_->getOptConfig().batch_size();
-  std::vector<Argument> outArgs;
-  startTestPeriod();
-  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
-    testOneDataBatch(dataBatch, &outArgs);
-  }
-  finishTestPeriod();
-}
-
-void Tester::finishTestPeriod() {
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-  testEvaluator_->finish();
-  CHECK_GT(testContext_.numSamples, 0)
-      << "There is no samples in your test batch. Possibly "
-         "wrong implementation of DataProvidor.reset()";
-  LOG(INFO) << " Test samples=" << testContext_.numSamples
-            << " cost=" << testContext_.cost / testContext_.numSamples
-            << " Eval: " << *testEvaluator_;
-  parameterUpdater_->restore();
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->getState(*intconfig_->testState);
-    gradientMachine_->setState(*intconfig_->trainState);
-  }
-}
-
-int64_t Tester::testOneBatchById(int64_t batchId) {
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-
-  testDataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return 0;
-  }
-
-  std::vector<Argument> outArgs;
-
-  stats_ += std::pair<int64_t, real>{
-      actualBatchSize,
-      forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)};
-
-  if (((batchId + 1) % intconfig_->logPeriod) == 0) {
-    LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false);
-  }
-
-  return actualBatchSize;
-}
-
-real Tester::forwardOneBatch(const DataBatch& dataBatch,
-                             Evaluator* evaluator,
-                             std::vector<Argument>* pOutArgs) {
-  auto& outArgs = *pOutArgs;
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-  if (intconfig_->loadsaveParametersInPserver) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote(false /*full parameter*/,
-                                           true /*after apply*/);
-  }
-
-  gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
-
-  // write features if set this flag and outArgs is not empty
-  std::string featFile = intconfig_->featFile;
-  if (!featFile.empty() && outArgs.empty()) {
-    size_t numOutputs = outArgs.size();
-    std::vector<MatrixPtr> featMatrices;
-    featMatrices.resize(numOutputs);
-    for (size_t i = 0; i < numOutputs; ++i) {
-      featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(),
-                                       outArgs[i].value->getWidth(),
-                                       false,
-                                       false);  // CPU data buffer
-      featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    FILE* fp = fopen(featFile.c_str(), "ab+");
-    CHECK(!ferror(fp)) << "Fail to open " << featFile;
-
-    size_t sampleNum = featMatrices[0]->getHeight();
-    for (size_t i = 0; i < sampleNum; ++i) {
-      for (size_t j = 0; j < numOutputs; ++j) {
-        size_t dim = featMatrices[j]->getWidth();
-        fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp);
-      }
-    }
-    fclose(fp);
-  }
-  if (evaluator) {
-    gradientMachine_->eval(evaluator);
-  }
-
-  // Save the output layers if predict_output_dir is not empty
-  std::string predictOutputDir = intconfig_->predictOutputDir;
-  if (!predictOutputDir.empty() && !outArgs.empty()) {
-    CHECK(intconfig_->testing) << "Only valid in test mode";
-    if (!os_.is_open()) {
-      // TODO(yuyang18): Refactor these lines.
-      constexpr int kBufLen = 100;
-      char buf[kBufLen];
-      snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId);
-      mkDir(predictOutputDir.c_str());
-      std::string filename = path::join(predictOutputDir, buf);
-      os_.open(filename, std::ofstream::trunc);
-      CHECK(os_.is_open()) << "Failed to open file " << filename;
-    }
-    printOutput(outArgs, os_);
-    return 0.0;  // In this case, there is no meaning to calculate cost
-  }
-
-  return Argument::sum(outArgs);
-}
-
-void Tester::testOnePassBatch(int passId) {
-  stats_.reset();
-  const std::vector<Argument> inArgs;
-  gradientMachine_->forward(inArgs, nullptr, PASS_TEST);
-  int64_t num;
-  real cost;
-  gradientMachine_->getStats(cost, num);
-  stats_ += std::pair<int64_t, real>{num, cost};
-  gradientMachine_->onPassEnd();
-
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false);
-}
-
-void Tester::testOnePass(int passId) {
-  stats_.reset();
-  int64_t batchId = 0;
-  int num = 0;
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-
-  testEvaluator_->start();
-
-  do {
-    num = testOneBatchById(batchId);
-    ++batchId;
-  } while (num > 0);
-
-  gradientMachine_->onPassEnd();
-  testEvaluator_->finish();
-
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false)
-            << " Eval: " << *testEvaluator_;
-
-  if (intconfig_->distributeTest) {
-    testEvaluator_->distributeEval(testParameterClient_.get());
-    if (0 == intconfig_->trainerId) {
-      LOG(INFO) << "distribute eval: " << *testEvaluator_;
-    }
-  }
-}
-
-void Tester::test() {
-  CHECK(testDataProvider_) << "TestData is not specified";
-  testDataProvider_->setSkipShuffle();
-  testDataProvider_->reset();
-  gradientMachine_->start();
-
-  // For evaluation
-  std::vector<std::string> modelList;
-  std::string modelListFromConfig = intconfig_->modelList;
-  std::string initModelPath = intconfig_->initModelPath;
-  if (!modelListFromConfig.empty()) {
-    loadFileList(modelListFromConfig, modelList);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = modelList.size();
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  } else if (!initModelPath.empty()) {
-    modelList.push_back(initModelPath);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = 1;
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  }
-
-  for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) {
-    int passId = i;
-    if (passId % intconfig_->savingPeriod == 0) {
-      if (intconfig_->testWait) {
-        while (paramUtil_->loadParameters(
-                   passId, true /*local*/, true /*remote*/) == false) {
-          LOG(INFO) << "Waiting for parameters of pass " << passId;
-          sleep(60);  // sleep 60s
-        }
-      } else {
-        if (modelList.size() == 0) {
-          CHECK_EQ(paramUtil_->loadParameters(
-                       passId, true /*local*/, true /*remote*/),
-                   true);
-        } else {
-          paramUtil_->loadParametersWithPath(
-              modelList[i], true /*local*/, true /*remote*/);
-        }
-      }
-      if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) {
-        testOnePassBatch(passId);
-      } else {
-        testOnePass(passId);
-      }
-      if (passId + intconfig_->savingPeriod < intconfig_->numPasses) {
-        // if there is at least 1 more pass to test, then call reset,
-        // otherwise not.
-        testDataProvider_->reset();
-      }
-    }
-  }
-
-  gradientMachine_->finish();
-}
-
-void Tester::printOutput(const std::vector<Argument>& outArgs,
-                         std::ostream& os) {
-  size_t numOutputs = outArgs.size();
-  size_t numIns = outArgs[0].getBatchSize();
-  if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) {
-    cpuMat_.resize(numOutputs, nullptr);
-    cpuVec_.resize(numOutputs, nullptr);
-  }
-
-  for (size_t i = 0; i < numOutputs; ++i) {
-    if (outArgs[i].value != nullptr) {
-      if (outArgs[i].value->useGpu()) {
-        if (dynamic_cast<GpuMatrix*>(outArgs[i].value.get())) {
-          size_t dim = outArgs[i].value->getWidth();
-          Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false);
-          cpuMat_[i]->copyFrom(*outArgs[i].value);
-        } else if (dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get())) {
-          auto sparseMat =
-              dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get());
-          cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(),
-                                                  sparseMat->getWidth(),
-                                                  sparseMat->getElementCnt(),
-                                                  sparseMat->getValueType(),
-                                                  sparseMat->format_,
-                                                  false,  /* trans */
-                                                  false); /* useGpu */
-          hl_stream_t stream = HPPL_STREAM_DEFAULT;
-          cpuMat_[i]->copyFrom(*sparseMat, stream);
-        } else {
-          LOG(WARNING) << "Not supported gpu matrix type";
-        }
-      }
-    } else if (outArgs[i].ids != nullptr) {
-      if (outArgs[i].ids->useGpu()) {
-        IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false);
-        cpuVec_[i]->copyFrom(*outArgs[i].ids);
-      }
-    } else if (outArgs[i].strs != nullptr) {
-      continue;
-    } else {
-      LOG(WARNING) << "outArgs[" << i << "] has no data to print";
-    }
-  }
-
-  for (size_t i = 0; i < numIns; ++i) {
-    for (size_t j = 0; j < numOutputs; ++j) {
-      if (outArgs[j].value != nullptr) {
-        if (outArgs[j].value->useGpu()) {
-          cpuMat_[j]->printOneRow(os, i);
-        } else {
-          outArgs[j].value->printOneRow(os, i);
-        }
-      } else if (outArgs[j].ids != nullptr) {
-        if (outArgs[j].ids->useGpu()) {
-          cpuVec_[j]->printOneElement(os, i);
-        } else {
-          outArgs[j].ids->printOneElement(os, i);
-        }
-      } else if (outArgs[j].strs != nullptr) {
-        os << (*outArgs[j].strs)[i] << ";";
-      }
-    }
-    os << std::endl;
-  }
-}
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.h b/paddle/legacy/trainer/Tester.h
deleted file mode 100644
index a298602d1d0..00000000000
--- a/paddle/legacy/trainer/Tester.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "TesterConfig.h"
-#include "TrainerInternalConfig.h"
-
-namespace paddle {
-
-/**
- * Neural Network test logics code.
- * It is a private class for Trainer.
- */
-class Tester {
- public:
-  /**
-   * Ctor
-   * @param config Trainer Config.
-   * @param intconfig Tester Config.
-   * @param gradientMachine Gradient machine(neuralnetwork) that will be tested.
-   * @param parameterUpdater Parameter Updater. Not for updating parameter, just
-   *                         for getting parameter from parameter-server.
-   * @param testDataProvider Test data provider.
-   */
-  Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-         std::unique_ptr<TesterConfig>&& intconfig,
-         const GradientMachinePtr& gradientMachine,
-         const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-         std::shared_ptr<DataProvider> testDataProvider);
-
-  /**
-   * test one period.
-   *
-   * One period means 2 things.
-   *   if test_period !=0 and not test_all_data_in_one_period, then
-   *      will test test_period * batch_size data.
-   *   else
-   *      will test whole test data.
-   *
-   * It is convenience to test small set of data when test data set is large and
-   * is training at same time.
-   */
-  void testOnePeriod();
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const DataBatch& dataBatch,
-                        std::vector<Argument>* outArgs);
-
-  /**
-   * Test for given data batch.
-   * @param dataBatch Data batch.
-   * @param evaluator Evaluator
-   * @return cost
-   */
-  real forwardOneBatch(const DataBatch& dataBatch,
-                       Evaluator* evaluator,
-                       std::vector<Argument>* outArgs);
-
-  /**
-   * performance the full pass of test given test data provider
-   */
-  void test();
-
- protected:
-  std::shared_ptr<ParameterClient2> testParameterClient_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TesterConfig> intconfig_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  std::unique_ptr<Evaluator> testEvaluator_;
-  std::unique_ptr<ParameterUtil> paramUtil_;
-  DataProviderPtr testDataProvider_;
-  TrainerStats stats_;
-
-  // Used for saving the values of output layers
-  std::ofstream os_;
-  std::vector<MatrixPtr> cpuMat_;
-  std::vector<IVectorPtr> cpuVec_;
-  struct {
-    int64_t numSamples;
-    real cost;
-  } testContext_;
-
- private:
-  /**
-   * Test one batch by batchId. It is only used for testOnePass.
-   *
-   * Durning testOnePass, each log_period will print cost statistics.
-   *
-   * @param batchId current batch id (from 0)
-   * @return num of tested samples. Zero if end of pass.
-   */
-  int64_t testOneBatchById(int64_t batchId);
-
-  /**
-   * Test whole pass in one batch.
-   *
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePassBatch(int passId);
-
-  /**
-   * test for one pass in several mini-batches.
-   *
-   * Used for sgd method.
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePass(int passId);
-
-  /**
-   * print the outArgs to a stream
-   *
-   * used for save feature file
-   *
-   * @param [in] outArgs output arguments for network.
-   * @param [in,out] os output stream.
-   */
-  void printOutput(const std::vector<Argument>& outArgs, std::ostream& os);
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TesterConfig.h b/paddle/legacy/trainer/TesterConfig.h
deleted file mode 100644
index 6c78f7cda34..00000000000
--- a/paddle/legacy/trainer/TesterConfig.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-
-/**
- * TesterConfig
- * general configs for training
- */
-struct TesterConfig {
-  /**
-   * indicate test period
-   */
-  int testPeriod;
-
-  /**
-   * indicate whether to save previous batch state
-   */
-  bool prevBatchState;
-
-  /**
-   * log period
-   */
-  int logPeriod;
-
-  /**
-   * loadsave parameters in pserver
-   */
-  bool loadsaveParametersInPserver;
-
-  /**
-   * feat file
-   */
-  std::string featFile;
-
-  /**
-   * predict output dir
-   */
-  std::string predictOutputDir;
-
-  /**
-   * trianer id
-   */
-  int trainerId;
-
-  /**
-   * distribute test
-   */
-  bool distributeTest;
-
-  /**
-   * training state
-   */
-  MachineState* trainState;
-
-  /**
-   * test state
-   */
-  MachineState* testState;
-
-  /**
-   * model list
-   */
-  std::string modelList;
-
-  /**
-   * test passes
-   */
-  int testPass;
-
-  /**
-   * num passes
-   */
-  int numPasses;
-
-  /**
-   * saving period
-   */
-  int savingPeriod;
-
-  /**
-   * test wait
-   */
-  int testWait;
-
-  /**
-   * init model path
-   */
-  std::string initModelPath;
-
-  /**
-   * save only one
-   */
-  bool saveOnlyOne;
-
-  /**
-   * testing mode
-   */
-  bool testing;
-
-  /**
-   * mode
-   */
-  int mode;
-
-  /**
-   * config loc
-   */
-  std::string config;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.cpp b/paddle/legacy/trainer/ThreadParameterUpdater.cpp
deleted file mode 100644
index 0601bdf24e3..00000000000
--- a/paddle/legacy/trainer/ThreadParameterUpdater.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadParameterUpdater.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
-#include "paddle/legacy/utils/Thread.h"
-
-DECLARE_int32(trainer_count);
-
-namespace paddle {
-
-SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
-    : config_(optConfig), numSamplesProcessed_(0) {
-  // fill types
-  auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/);
-  for (auto type : types) {
-    addParameterType(type);
-  }
-}
-
-void SgdThreadUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  // calc max parameter id
-  size_t maxId = 0;
-  for (auto& para : parameters_) {
-    maxId = std::max(maxId, para->getID());
-  }
-
-  optimizers_.resize(maxId + 1);
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid].reset(sgdOptimizerCreate(config_,
-                                              para->getConfig(),
-                                              para->isGradSparseUpdate(),
-                                              false /*inPserver*/));
-    size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
-    optimizers_[pid]->init(numRows, &para->getConfig());
-    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
-      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
-      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
-      // Parameter::enableType(). But gradient parameter buf is still used
-      // in SgdThreadUpdater. We need to explicitly create it.
-      //
-      // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT
-      // as a temp buffer.
-      para->enableBufType(PARAMETER_GRADIENT);
-    }
-  }
-}
-
-void SgdThreadUpdater::startPass() {
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startPass();
-  }
-}
-
-bool SgdThreadUpdater::finishPass() {
-  catchUpWith();
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishPass();
-  }
-  return true;
-}
-
-void SgdThreadUpdater::updateImpl(Parameter* para) {
-  if (!para->useGpu()) return;
-  SetDevice setDevice(para->getDeviceId());
-  ParameterOptimizer* optimizer = optimizers_[para->getID()].get();
-  optimizer->update(para->getBufs(), para->getConfig());
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(para->getBufs(), para->getConfig(), -1LU);
-  }
-
-  para->setValueUpdated();
-  para->clearGradient();
-}
-
-void SgdThreadUpdater::threadTraverse(
-    const ParameterOptimizer::TraverseCallback& callback,
-    int tid,
-    size_t numThreads,
-    Parameter* para) {
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  if (para->isGradSparseUpdate()) {
-    size_t height = para->getConfig().dims(0);
-    size_t width = para->getConfig().dims(1);
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  } else {  // dense
-    // setup sub bufs
-    auto interval = calcSplitArrayInterval(
-        para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    for (auto type : parameterTypes_) {
-      vecs[type]->subVecFrom(*para->getBuf(type), interval);
-    }
-
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
-  bool hasCpuPara = false;
-  bool hasGpuPara = false;
-  for (auto& para : parameters_) {
-    if (para->useGpu()) {
-      hasGpuPara = true;
-    } else {
-      hasCpuPara = true;
-    }
-  }
-
-  auto cpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (auto callback = getTraverseCallback(para.get())) {
-        threadTraverse(callback, tid, numThreads, para.get());
-      }
-    }
-  };
-  auto gpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->useGpu()) {
-        if (auto callback = getTraverseCallback(para.get())) {
-          SetDevice setDevice(para->getDeviceId());
-          callback(para->getBufs(), para->getConfig(), -1LU);
-        }
-      }
-    }
-  };
-
-  if (hasCpuPara && hasGpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse);
-  } else if (hasCpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse);
-  } else if (hasGpuPara) {
-    gpuTraverse(0, 0);
-  }
-}
-
-void SgdThreadUpdater::catchUpWith() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->startCatchUpWith();
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishCatchUpWith();
-  }
-}
-
-void SgdThreadUpdater::apply() {
-  catchUpWith();
-
-  traverse(
-      [this](Parameter* para) { return optimizers_[para->getID()]->apply(); });
-}
-
-void SgdThreadUpdater::restore() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->restore();
-  });
-}
-
-PassType SgdThreadUpdater::startBatch(int64_t batchSize) {
-  numSamplesProcessed_ += batchSize;
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startBatch(numSamplesProcessed_);
-  }
-  return PASS_TRAIN;
-}
-
-void SgdThreadUpdater::finishBatch(real cost) {
-  getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->isGradSparseUpdate()) {
-        threadUpdateSparse(tid, numThreads, para.get());
-      } else if (!para->useGpu()) {
-        threadUpdateDense(tid, numThreads, para.get());
-      }
-    }
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishBatch();
-  }
-}
-
-void SgdThreadUpdater::threadUpdateSparse(int tid,
-                                          size_t numThreads,
-                                          Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  size_t height = para->getConfig().dims(0);
-  size_t width = para->getConfig().dims(1);
-
-  if (dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get())) {
-    // From MultiGradientMachine
-    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
-
-    for (auto id : sparseIds) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    sparseIds.clear();
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(
-                 para->getMat(PARAMETER_GRADIENT).get())) {
-    // From NeuralNetwork
-    SparseRowCpuMatrix* mainMat = dynamic_cast<SparseRowCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-
-    std::vector<unsigned int>& localIndices =
-        mainMat->getIndexDictHandle()->localIndices;
-
-    auto interval =
-        calcSplitArrayInterval(localIndices.size(), tid, numThreads);
-    for (size_t i = interval.first; i < interval.second; ++i) {
-      auto id = localIndices[i];
-      real* row = mainMat->getLocalRow(i);
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        if (type == PARAMETER_GRADIENT) {
-          vecs[type]->subVecFrom(row, 0, width);
-        } else {
-          vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-        }
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    // For numThreads > 1, MultiGradientMachine is used, which goes
-    // to the above branch.
-    CHECK_EQ(numThreads, 1UL);
-    mainMat->clearIndices();
-  } else {
-    auto& m = *para->getMat(PARAMETER_GRADIENT).get();
-    LOG(FATAL) << "Internal error: " << para->getName() << " "
-               << typeid(m).name();
-  }
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  }
-}
-
-void SgdThreadUpdater::threadUpdateDense(int tid,
-                                         size_t numThreads,
-                                         Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  auto interval = calcSplitArrayInterval(
-      para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-
-  // setup sub bufs
-  for (auto type : parameterTypes_) {
-    vecs[type]->subVecFrom(*para->getBuf(type), interval);
-  }
-
-  // update
-  optimizer->update(vecs, para->getConfig());
-  vecs[PARAMETER_GRADIENT]->zeroMem();
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.h b/paddle/legacy/trainer/ThreadParameterUpdater.h
deleted file mode 100644
index 172287d4eb5..00000000000
--- a/paddle/legacy/trainer/ThreadParameterUpdater.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Regularizer.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * \brief A parameter updater that uses multiple threads to update parameters.
-   This parameter updater handles GPU and CPU updates differently,
-   because at the current moment, the merging on CPU is happening on the
-   main thread, and the its parameter size can be much larger than the one GPU.
-   Thus, for GPU, the parameter updates happens in updateImpl() function, which
-   is called by gradient machines as a callback function supplied to backward()
-   and forwardBackward().
-   For CPU, the parameter updates happens in separate threads maintained by this
-   class.
- */
-class SgdThreadUpdater : public ParameterUpdater {
- public:
-  explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
-  virtual ~SgdThreadUpdater() {}
-
-  // Use the startPass() function of the base optimizer.
-  virtual void startPass();
-
-  // Use the finishPass() function of the base optimizer.
-  virtual bool finishPass();
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  virtual PassType startBatch(int64_t batchSize);
-  // Call finishBatch for each optimizer.
-  virtual void finishBatch(real cost);
-  virtual void catchUpWith();
-  virtual void apply();
-  virtual void restore();
-
- protected:
-  // This is the function that will be eventualy called by the GradientMachine.
-  // used only for GPU update.
-  virtual void updateImpl(Parameter* para);
-  OptimizationConfig config_;
-  int64_t numSamplesProcessed_;
-
-  // One optimizers for each parameter.
-  std::vector<std::unique_ptr<ParameterOptimizer>> optimizers_;
-
-  // The update function for CPU sparse parameters.
-  void threadUpdateSparse(int tid, size_t numThreads, Parameter* para);
-
-  // The update function for CPU dense parameters.
-  void threadUpdateDense(int tid, size_t numThreads, Parameter* para);
-  // The update function for after update operations, such as averager.
-  void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
-                      int tid,
-                      size_t numThreads,
-                      Parameter* para);
-  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
-      GetTraverseCallback;
-  void traverse(GetTraverseCallback getTraverseCallback);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.cpp b/paddle/legacy/trainer/Trainer.cpp
deleted file mode 100644
index 2db754793cf..00000000000
--- a/paddle/legacy/trainer/Trainer.cpp
+++ /dev/null
@@ -1,653 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Trainer.h"
-
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "TesterConfig.h"
-#include "ThreadParameterUpdater.h"
-#include "TrainerConfigHelper.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-DEFINE_string(config, "", "Trainer config file");
-
-DEFINE_int32(test_period,
-             0,
-             "if equal 0, do test on all test data at the end of "
-             "each pass. While if equal non-zero, do test on all test "
-             "data every test_period batches");
-DEFINE_bool(test_all_data_in_one_period,
-            false,
-            "This option was deprecated, since we will always do "
-            "test on all test set ");
-
-DEFINE_bool(local, true, "Train in local mode or not");
-
-DEFINE_int32(average_test_period,
-             0,
-             "Do test on average parameter every so"
-             " many batches. MUST be devided by FLAGS_log_period."
-             " Default 0 means do not test average parameter");
-
-DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-DEFINE_int64(saving_period_by_batches,
-             0,
-             "Save parameters every so many batches in one pass");
-DEFINE_string(save_dir, "", "Directory for saving model parameter");
-DEFINE_int32(start_pass,
-             0,
-             "Start training from this pass. "
-             "Will load parameter from the previous pass");
-DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
-DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-DEFINE_bool(with_cost, true, "enable cost layer or not");
-DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-DEFINE_int32(num_passes, 100, "train for so many passes");
-
-DEFINE_string(config_args,
-              "",
-              "arguments passed to config file."
-              "Format: key1=value1,key2=value2");
-
-DEFINE_bool(save_only_one,
-            false,
-            "Save only parameters in last pass, remove previous.");
-
-DEFINE_string(feat_file, "", "File name of extracted feature.");
-DEFINE_string(predict_output_dir,
-              "",
-              "Directory that saves the predicted results of output layers");
-DEFINE_string(model_list, "", "File that saves the model list when evaluation");
-
-namespace paddle {
-
-void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                   bool testing,
-                   const std::shared_ptr<GradientMachine>& gradientMachine,
-                   const std::shared_ptr<DataProvider>& dataProvider,
-                   const std::shared_ptr<DataProvider>& testDataProvider) {
-  this->stats_ = std::make_shared<TrainerStats>();
-
-  config_ = config;
-
-  config_->updateConfigFromFlags();
-
-  testing_ = testing;
-
-  // in testing, mode_ may GradientMachine::kTesting or
-  // GradientMachine::kSgdSparseCpuTraining
-
-  if (FLAGS_local) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "local and loadsave_parameters_in_pserver can not both true";
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdaterForEachParams();
-      LOG(INFO) << "ignore sparse_remote_update=true due to  --local=true";
-    }
-  }
-  if (FLAGS_loadsave_parameters_in_pserver) {
-    CHECK(config_->getOptConfig().use_sparse_remote_updater())
-        << "no parameter to load from pserver, please check network config";
-  }
-  if (testing && !FLAGS_loadsave_parameters_in_pserver) {
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdater();
-      LOG(INFO) << "because parameter is loaded local,"
-                << "tester ignore sparse_remote_update flag";
-    }
-  }
-
-  CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm()))
-      << "invalid algorithm configuration: "
-      << config_->getOptConfig().algorithm();
-
-  bool useSparseUpdater = false;
-  for (auto& paraConfig : config_->getModelConfig().parameters()) {
-    if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) {
-      useSparseUpdater = true;
-    }
-  }
-
-  if (FLAGS_use_mkldnn) {
-    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
-  }
-
-  if (testing) {
-    LOG(INFO) << "trainer: in testing mode";
-    if (config_->getOptConfig().use_sparse_remote_updater() ||
-        FLAGS_trainer_count > 1) {
-      mode_ = GradientMachine::kSgdSparseCpuTraining;
-      LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-    } else {
-      mode_ = GradientMachine::kTesting;
-      LOG(INFO) << "trainer mode: Testing";
-    }
-  } else if (IGradientMachineMode::tryGetMode(
-                 (int*)&mode_,
-                 config_->getOptConfig().algorithm(),
-                 FLAGS_trainer_count,
-                 FLAGS_local,
-                 FLAGS_use_gpu)) {
-    LOG(INFO) << "Custom trainer mode.";
-  } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD ||
-              config_->getOptConfig().algorithm() ==
-                  TrainAlgorithm::AsyncSGD) &&
-             useSparseUpdater) {
-    mode_ = GradientMachine::kSgdSparseCpuTraining;
-    LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-  } else {
-    mode_ = GradientMachine::kNormal;
-    LOG(INFO) << "trainer mode: Normal";
-  }
-
-  // initialize trainer internal
-  trainerInternal_.init(config_,
-                        gradientMachine,
-                        TrainerInternalConfig::createFromMode(mode_),
-                        stats_,
-                        testing);
-  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(FLAGS_save_only_one,
-                              FLAGS_saving_period,
-                              FLAGS_loadsave_parameters_in_pserver,
-                              FLAGS_config));
-
-  paramUtil_.reset(
-      new paddle::ParameterUtil(config_,
-                                std::move(paramConfig),
-                                trainerInternal_.getGradientMachine(),
-                                trainerInternal_.getParameterUpdater()));
-
-  bool gpuData =
-      FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
-      (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
-
-  dataProvider_ = dataProvider;
-  if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
-    dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
-  }
-  if (!testDataProvider_) {
-    // No evaluator_ if there is testDataProvider but no dataProvider.
-    evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator());
-    currentEvaluator_.reset(
-        trainerInternal_.getGradientMachine()->makeEvaluator());
-    if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 &&
-        config_->getOptConfig().average_window() > 0) {
-      CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0)
-          << "FLAGS_average_test_period must be divided by FALGS_log_period";
-      averageEvaluator_.reset(
-          trainerInternal_.getGradientMachine()->makeEvaluator());
-    }
-  }
-
-  testDataProvider_ = testDataProvider;
-  if (!testDataProvider_ && config_->hasTestDataConfig()) {
-    testDataProvider_.reset(
-        DataProvider::create(config_->getTestDataConfig(), *config_, gpuData));
-  }
-  if (testDataProvider_) {
-    createTester();
-  }
-
-  if (!testing &&
-      (trainerInternal_.getGradientMachine()->hasStaticParameters())) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "is_static and loadsave_parameters_in_pserver can not both true";
-  }
-  if (testing) {
-    // will load per pass for tester
-  } else if (paramUtil_->tryLoadParametersFromConfig()) {
-    // load from config already.
-  } else {
-    trainerInternal_.getGradientMachine()->randParameters();
-  }
-
-  // Only non static parameters need to be updated
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  if (trainerInternal_.getParameterUpdater()) {
-    trainerInternal_.getParameterUpdater()->init(parameters);
-
-    if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) {
-      if (testing) {
-        // will load per pass for tester
-      } else if (!config_->getConfig().init_model_path().empty() &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        paramUtil_->loadParametersWithPath(
-            config_->getConfig().init_model_path(),
-            false /*local*/,
-            true /*remote*/);
-      } else if (config_->getConfig().start_pass() > 0 &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1,
-                                         false /*local*/,
-                                         true /*remote*/));
-      } else {
-        trainerInternal_.getParameterUpdater()->randParametersRemote();
-      }
-    }
-  }
-
-  // set current evaluator and evalutor
-  trainerInternal_.setCurrentEvaluator(currentEvaluator_.get());
-  trainerInternal_.setEvaluator(evaluator_.get());
-}
-
-void Trainer::train(size_t numPasses) {
-  startTrain();
-  for (size_t i = 0; i < numPasses; ++i) {
-    if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) {
-      trainOnePassBatch(config_->getConfig().start_pass() + i);
-    } else {
-      trainOnePass();
-    }
-    if (i < numPasses - 1) {
-      dataProvider_->reset();
-    }
-  }
-
-  finishTrain();
-}
-
-static double genPerturbation(real* d, real* grad, size_t dim) {
-  auto& reng = ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<double> dist(-1, 1);
-  double gradNorm = 0, dNorm = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    d[i] = dist(reng);
-    dNorm += d[i] * d[i];
-    gradNorm += grad[i] * grad[i];
-  }
-  if (gradNorm > 0) {
-    real s = 0.5 * sqrt(gradNorm / dNorm);
-    for (size_t i = 0; i < dim; ++i) {
-      d[i] = s * d[i] + grad[i];
-    }
-  }
-  double delta = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    delta += grad[i] * d[i];
-  }
-  return delta;
-}
-
-real Trainer::checkGradient() {
-  trainerInternal_.getGradientMachine()->start();
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-
-  dataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  std::vector<Argument>& inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sum(outArgs);
-  LOG(INFO) << "original cost=" << cost;
-  trainerInternal_.getGradientMachine()->backward();
-
-  real maxDiff = 0;
-  char fill = ' ';
-  for (auto& parameter : parameters) {
-    CpuVector oldPara(parameter->getSize());
-    CpuVector newPara(parameter->getSize());
-    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
-    real* grad = cpuGrad.getData();
-    size_t dim = parameter->getSize();
-    std::vector<real> d(dim);
-
-    double delta = genPerturbation(d.data(), grad, dim);
-
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    delta *= step;
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] + step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sum(outArgs);
-
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] - step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sum(outArgs);
-
-    real trueDelta = 0.5 * (newCost1 - newCost2);
-    real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill)
-              << std::setw(20) << parameter->getName()
-              << "step=" << std::setw(15) << step << "cost1=" << std::setw(10)
-              << newCost1 << "cost2=" << std::setw(10) << newCost2
-              << "true_delta=" << std::setw(15) << trueDelta
-              << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff
-              << (std::abs(diff) > 0.01 ? " ***" : "");
-
-    maxDiff = std::max(maxDiff, std::abs(diff));
-
-    // restore parameter
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
-    parameter->setValueUpdated();
-
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-  return maxDiff;
-}
-
-void Trainer::startTrain() {
-  trainPassContext_.passId = config_->getConfig().start_pass();
-  srand(config_->getConfig().start_pass() + 1);
-  if (dataProvider_) {
-    dataProvider_->reset();
-  }
-
-  trainerInternal_.getGradientMachine()->start();
-}
-
-void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }
-
-void Trainer::startTrainPass() {
-  stats_->reset();
-  trainPassContext_.batchId = 0;
-  trainPassContext_.avgTestCost = 0;
-  trainPassContext_.numAvgTests = 0;
-  trainPassContext_.passInnerId = 1;
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-  if (FLAGS_prev_batch_state) {
-    trainerInternal_.getGradientMachine()->resetState();
-    trainerInternal_.getGradientMachine()->getState(testState_);
-  }
-}
-
-void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
-  int num = dataBatch.getSize();
-  if (averageEvaluator_) {
-    int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period;
-    if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
-      if (mod == FLAGS_average_test_period - FLAGS_log_period) {
-        averageEvaluator_->start();
-      }
-      trainerInternal_.getParameterUpdater()->apply();
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->getState(trainState_);
-      }
-      trainPassContext_.avgTestCost += tester_->forwardOneBatch(
-          dataBatch, averageEvaluator_.get(), &forwardOutput_);
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->setState(trainState_);
-      }
-      trainPassContext_.numAvgTests += num;
-      trainerInternal_.getParameterUpdater()->restore();
-    }
-  }
-  {
-    REGISTER_TIMER("TrainBatch");
-    trainerInternal_.trainOneBatch(
-        trainPassContext_.batchId, dataBatch, &forwardOutput_);
-  }
-
-  if (averageEvaluator_ &&
-      trainPassContext_.batchId % FLAGS_average_test_period ==
-          FLAGS_average_test_period - 1) {
-    averageEvaluator_->finish();
-    LOG(INFO) << " Averaged parameter:"
-              << " cost="
-              << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests
-              << " Eval: " << *averageEvaluator_;
-    trainPassContext_.numAvgTests = 0;
-    trainPassContext_.avgTestCost = 0;
-  }
-
-  ++trainPassContext_.batchId;
-
-  if (trainPassContext_.batchId % FLAGS_log_period == 0) {
-    FOR_TIMING(globalStat.setThreadInfo(true));
-    FOR_TIMING(globalStat.printAllStatus());
-    FOR_TIMING(globalStat.reset());
-  }
-
-  if (testDataProvider_ && FLAGS_test_period > 0 &&
-      trainPassContext_.batchId % FLAGS_test_period == 0) {
-    tester_->testOnePeriod();
-  }
-
-  if (FLAGS_saving_period_by_batches > 0 &&
-      trainPassContext_.batchId >
-          FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
-      0 == FLAGS_trainer_id) {
-    trainerInternal_.getParameterUpdater()->catchUpWith();
-    if (testDataProvider_) {
-      tester_->testOnePeriod();
-    }
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId,
-                                      trainPassContext_.passInnerId);
-    ++trainPassContext_.passInnerId;
-  }
-}
-
-void Trainer::finishTrainPass() {
-  if (trainPassContext_.batchId == 0) {
-    // This means no more data from DataProvider
-    return;
-  }
-
-  trainerInternal_.finishTrainPass(trainPassContext_.passId,
-                                   trainPassContext_.batchId);
-
-  FOR_TIMING(globalStat.setThreadInfo(true));
-  FOR_TIMING(globalStat.printAllStatus());
-  FOR_TIMING(globalStat.reset());
-
-  if (testDataProvider_) {
-    tester_->testOnePeriod();
-  }
-
-  if (trainPassContext_.passId % FLAGS_saving_period == 0 &&
-      FLAGS_trainer_id == 0) {
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId);
-  }
-  ++trainPassContext_.passId;
-}
-
-void Trainer::trainOnePass() {
-  startTrainPass();
-  size_t batchSize = config_->getOptConfig().batch_size();
-  while (true) {
-    DataBatch dataBatch;
-
-    int num = 0;
-    {
-      REGISTER_TIMER("getTrainBatch");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-    if (num == 0) break;
-    CHECK_EQ(num, dataBatch.getSize());
-    trainOneDataBatch(dataBatch);
-  }
-
-  finishTrainPass();
-}
-
-void Trainer::trainOnePassBatch(int passId) {
-  this->stats_->reset();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  const std::vector<Argument> inArgs;
-  {
-    REGISTER_TIMER("onePass");
-    trainerInternal_.getGradientMachine()->forwardBackward(
-        inArgs, nullptr, PASS_TRAIN, nullptr);
-  }
-
-  real cost = .0;
-  int64_t num = 0;
-  trainerInternal_.getGradientMachine()->getStats(cost, num);
-  *stats_ += {num, cost};
-
-  trainerInternal_.getGradientMachine()->onPassEnd();
-
-  bool accepted = trainerInternal_.getParameterUpdater()->finishPass();
-
-  globalStat.setThreadInfo(true);
-  globalStat.printAllStatus();
-  globalStat.reset();
-
-  LOG(INFO) << " Pass=" << passId
-            << " AcceptedPass=" << (accepted ? acceptedPassId_ : -1)
-            << stats_->getStats(false /*withCurrentCost*/);
-
-  if (accepted) {
-    if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) {
-      paramUtil_->saveParameters(acceptedPassId_);
-    }
-    acceptedPassId_++;
-    if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) {
-      paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period);
-    }
-  }
-}
-
-real Trainer::calcGradient(const DataBatch& dataBatch,
-                           const Vector& value,
-                           Vector& gradient) {
-  CHECK_EQ(value.getSize(), gradient.getSize());
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getParameters();
-
-  clearGradient();
-
-  size_t offset = 0;
-  size_t valueSize = value.getSize();
-
-  for (auto& para : parameters) {
-    CHECK_LE(offset + para->getSize(), valueSize);
-    VectorPtr val =
-        Vector::create(para->getSize(), value.getMemoryHandle(), offset);
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*val);
-    para->setValueUpdated();
-    offset += para->getSize();
-  }
-
-  CHECK_EQ(offset, valueSize);
-
-  std::vector<Argument> inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forwardBackward(
-      inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sum(outArgs);
-
-  offset = 0;
-  for (auto& para : parameters) {
-    VectorPtr grad =
-        Vector::create(para->getSize(), gradient.getMemoryHandle(), offset);
-    if (para->getBuf(PARAMETER_GRADIENT)) {
-      grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    }
-    offset += para->getSize();
-  }
-
-  return cost;
-}
-
-void Trainer::clearGradient() {
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  for (auto& parameter : parameters) {
-    parameter->clearGradient();
-  }
-}
-
-int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }
-
-void Trainer::createTester() {
-  tester_.reset(new paddle::Tester(config_,
-                                   createTesterConfig(),
-                                   trainerInternal_.getGradientMachine(),
-                                   trainerInternal_.getParameterUpdater(),
-                                   testDataProvider_));
-}
-
-void Trainer::test() { tester_->test(); }
-
-std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
-  TesterConfig* conf = new TesterConfig;
-  if (FLAGS_test_period) {
-    LOG(WARNING) << "The meaning of --test_period is changed: "
-                 << "if equal 0, do test on all test data at the end of "
-                 << "each pass. While if equal non-zero, do test on all test "
-                 << "data every test_period batches ";
-  }
-  if (FLAGS_test_all_data_in_one_period) {
-    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
-                 << "we will always do test on all test set ";
-  }
-  conf->testPeriod = FLAGS_test_period;
-  conf->prevBatchState = FLAGS_prev_batch_state;
-  conf->logPeriod = FLAGS_log_period;
-  conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
-  conf->featFile = FLAGS_feat_file;
-  conf->predictOutputDir = FLAGS_predict_output_dir;
-  conf->trainerId = FLAGS_trainer_id;
-  conf->distributeTest = FLAGS_distribute_test;
-  conf->config = FLAGS_config;
-  conf->modelList = FLAGS_model_list;
-  conf->testPass = FLAGS_test_pass;
-  conf->numPasses = FLAGS_num_passes;
-  conf->savingPeriod = FLAGS_saving_period;
-  conf->testWait = FLAGS_test_wait;
-  conf->initModelPath = FLAGS_init_model_path;
-  conf->saveOnlyOne = FLAGS_save_only_one;
-  conf->testing = testing_;
-  conf->mode = mode_;
-  conf->trainState = &trainState_;
-  conf->testState = &testState_;
-  return std::unique_ptr<TesterConfig>(conf);
-}
-
-ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); }
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.h b/paddle/legacy/trainer/Trainer.h
deleted file mode 100644
index b467f9af0cf..00000000000
--- a/paddle/legacy/trainer/Trainer.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "Tester.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternal.h"
-
-DECLARE_int32(num_passes);
-
-namespace paddle {
-
-/**
- * Trainer Class
- *
- * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to
- * train/test a NeuralNetwork.
- */
-class Trainer {
- public:
-  /**
-   * Ctor.
-   * @return
-   */
-  Trainer() : acceptedPassId_(0) {}
-
-  virtual ~Trainer() {}
-
-  /**
-   * initialize a new trainer using config
-   *
-   * @param config TrainerConfig.
-   * @param testing true if only for testing
-   * @param gradientMachine GradientMachine that will be trained.
-   *                        nullptr if create from config.
-   * @param dataProvider Train Data Provider. null if create from config.
-   * @param testDataProvider Test Data Provider. null if create from config.
-   */
-  virtual void init(
-      const std::shared_ptr<TrainerConfigHelper>& config,
-      bool testing = false,
-      const std::shared_ptr<GradientMachine>& gradientMachine = nullptr,
-      const std::shared_ptr<DataProvider>& dataProvider = nullptr,
-      const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
-
-  /**
-   * Train until num_passes reached.
-   * One pass means neural network train through all training data.
-   *
-   * @param numPasses the number of traning pass.
-   * @note Durning neural network training, the num passes may set a very large
-   * value, and kill training process when result is good enough.
-   */
-  void train(size_t numPasses = (size_t)FLAGS_num_passes);
-
-  /**
-   * compare the gradient from bp with finite difference
-   * @return  the maximal difference
-   */
-  real checkGradient();
-
-  void startTrain();
-  void finishTrain();
-  void startTrainPass();
-  void finishTrainPass();
-  void trainOneDataBatch(DataBatch& dataBatch);
-  void time();
-
-  /**
-   * given a dataBatch and the current parameter value
-   * calculate its gradient and return the cost.
-   *
-   * TODO(yuyang18): I think this method is deprecated and buggy. Should it be
-   * removed?
-   */
-  real calcGradient(const DataBatch& dataBatch,
-                    const Vector& value,
-                    Vector& gradient);
-
-  /**
-   * Get Trainer Config.
-   */
-  const TrainerConfig& getConfig() const { return config_->getConfig(); }
-
-  /**
-   * Get Train Data Provider
-   */
-  const DataProviderPtr& getDataProvider() { return dataProvider_; }
-
-  /**
-   * Get Gradient Machine.
-   */
-  const GradientMachinePtr& getGradientMachine() {
-    return trainerInternal_.getGradientMachine();
-  }
-
-  /**
-   * Get batch size in optimization config.
-   * @note This method didn't return the actual batch size. Just batch size
-   * set in the optimization config. The actual batch size in one trainer may
-   * less than batch size in config due to there are not enough data.
-   */
-  int getBatchSize();
-
-  /**
-   * Do test job
-   */
-  void test();
-
-  /**
-   * Get parameter util ptr
-   *
-   * TODO(yuyang18): Make it return a smart pointer.
-   */
-  ParameterUtil* getParameterUtilPtr();
-
- protected:
-  /**
-   * Train one pass of data.
-   *
-   * SGD Method.
-   */
-  void trainOnePass();
-
-  /**
-   * Train one pass in one batch.
-   *
-   */
-  void trainOnePassBatch(int passId);
-
-  /**
-   * set parameter gradient to zero
-   */
-  void clearGradient();
-
-  void createTester();
-
- private:
-  std::unique_ptr<TesterConfig> createTesterConfig();
-
- protected:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::shared_ptr<TrainerStats> stats_;
-
-  DataProviderPtr dataProvider_;
-  DataProviderPtr testDataProvider_;
-  MachineState trainState_;
-  MachineState testState_;
-
-  struct TrainPassContext {
-    int64_t batchId;
-    real avgTestCost;
-    int64_t numAvgTests;
-    int passId;
-    int passInnerId;
-  };
-  std::vector<paddle::Argument> forwardOutput_;
-
-  TrainPassContext trainPassContext_;
-
-  std::unique_ptr<Evaluator> evaluator_;
-  std::unique_ptr<Evaluator> currentEvaluator_;
-  std::unique_ptr<Evaluator> averageEvaluator_;
-  // training mode
-  // used to decide which GradientMachine and ParameterUpdater to create
-  GradientMachine::CreateMode mode_;
-  int testing_;
-  int acceptedPassId_;
-
-  // trainer tester
-  std::unique_ptr<Tester> tester_;
-
-  // parameter util
-  std::unique_ptr<ParameterUtil> paramUtil_;
-
-  // trainer Internal
-  TrainerInternal trainerInternal_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerBenchmark.cpp b/paddle/legacy/trainer/TrainerBenchmark.cpp
deleted file mode 100644
index 7f5bd233548..00000000000
--- a/paddle/legacy/trainer/TrainerBenchmark.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-
-#include "Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DECLARE_int32(test_period);
-
-DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
-
-namespace paddle {
-
-void Trainer::time() {
-  startTrain();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-  int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
-                           << num << " != " << batchSize;
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  std::vector<paddle::Argument> outputs;
-  // burning time
-  LOG(INFO) << "Burning time...";
-  for (int n = 0; n < 10; ++n) {
-    trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-  }
-  LOG(INFO) << "Burning time end.";
-
-  for (int n = 0; n < FLAGS_test_period; n++) {
-    if (FLAGS_feed_data) {
-      REGISTER_TIMER("GetData");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-
-    if (num != batchSize) {
-      break;
-    }
-
-    {
-      REGISTER_TIMER("FwdBwd");
-      trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-    }
-  }
-  globalStat.setThreadInfo(true);
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-
-  finishTrain();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.cpp b/paddle/legacy/trainer/TrainerConfigHelper.cpp
deleted file mode 100644
index 4d31ba8d71d..00000000000
--- a/paddle/legacy/trainer/TrainerConfigHelper.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerConfigHelper.h"
-#include "ParamUtil.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-DECLARE_string(save_dir);
-DECLARE_int32(trainer_id);
-DECLARE_bool(local);
-DECLARE_bool(with_cost);
-DECLARE_bool(with_gpu);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkl_packed);
-
-const char *kConfigParserModuleName = "paddle.trainer.config_parser";
-const char *kConfigParserFuncName = "parse_config_and_serialize";
-
-namespace paddle {
-
-struct TrainerConfigHelperPrivate {
-  TrainerConfig conf;
-};
-
-TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
-    : m(new TrainerConfigHelperPrivate()) {
-  std::ostringstream configArgs;
-  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
-             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
-             << ",parallel_nn=" << FLAGS_parallel_nn
-             << ",use_mkldnn=" << FLAGS_use_mkldnn
-             << ",use_mkl_packed=" << FLAGS_use_mkl_packed
-             << ",cudnn_version=" << hl_get_cudnn_lib_version();
-  if (!FLAGS_config_args.empty()) {
-    configArgs << "," << FLAGS_config_args;
-  }
-
-  VLOG(3) << "Parsing trainer config " << configFilePath;
-  std::string configProtoStr =
-      callPythonFunc(kConfigParserModuleName,
-                     kConfigParserFuncName,
-                     {configFilePath, configArgs.str()});
-  CHECK(m->conf.ParseFromString(configProtoStr));
-}
-
-TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
-    : m(new TrainerConfigHelperPrivate()) {
-  m->conf = config;
-}
-
-TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
-
-const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
-
-TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; }
-
-const OptimizationConfig &TrainerConfigHelper::getOptConfig() const {
-  return m->conf.opt_config();
-}
-
-const ModelConfig &TrainerConfigHelper::getModelConfig() const {
-  return m->conf.model_config();
-}
-
-const DataConfig *TrainerConfigHelper::getDataConfigPtr() const {
-  if (m->conf.has_data_config()) {
-    return &m->conf.data_config();
-  } else {
-    return nullptr;
-  }
-}
-
-const DataConfig &TrainerConfigHelper::getTestDataConfig() const {
-  CHECK(m->conf.has_test_data_config());
-  return m->conf.test_data_config();
-}
-
-bool TrainerConfigHelper::hasDataConfig() const {
-  return m->conf.has_data_config();
-}
-
-bool TrainerConfigHelper::hasTestDataConfig() const {
-  return m->conf.has_test_data_config();
-}
-
-void TrainerConfigHelper::updateConfigFromFlags() {
-  if (!FLAGS_save_dir.empty()) {
-    m->conf.set_save_dir(FLAGS_save_dir);
-  }
-  if (!FLAGS_init_model_path.empty()) {
-    m->conf.set_init_model_path(FLAGS_init_model_path);
-  }
-  if (FLAGS_start_pass != 0) {
-    m->conf.set_start_pass(FLAGS_start_pass);
-  }
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdater() {
-  m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false);
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() {
-  this->disableRemoteSparseUpdater();
-  for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) {
-    m->conf.mutable_model_config()
-        ->mutable_parameters(i)
-        ->set_sparse_remote_update(false);
-  }
-}
-
-OptimizationConfig &TrainerConfigHelper::getOptConfig() {
-  return *m->conf.mutable_opt_config();
-}
-
-void TrainerConfigHelper::setSaveDir(const std::string &saveDir) {
-  m->conf.set_save_dir(saveDir);
-}
-
-const std::string &TrainerConfigHelper::getSaveDir() const {
-  return m->conf.save_dir();
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPath(
-    const std::string &modelPath) {
-  std::ifstream s(path::join(modelPath, "path.txt"));
-  CHECK(s.is_open()) << " fail to open path.txt";
-  std::string ss;
-  getline(s, ss);
-  VLOG(3) << "fileName " << path::join(modelPath, ss);
-  s.close();
-  return path::join(modelPath, ss);
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPassId(
-    int passId, const std::string &modelPath) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf));
-}
-
-std::string TrainerConfigHelper::getConfigName(bool *ok) const {
-  std::string retv = "";
-
-  if (!m->conf.config_file().empty()) {
-    retv = m->conf.config_file();
-  } else if (!m->conf.init_model_path().empty()) {
-    retv = getConfigNameFromPath(m->conf.init_model_path());
-  } else if (m->conf.start_pass() >= 1) {
-    retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir());
-  }
-
-  if (ok) {
-    *ok = !retv.empty();
-  }
-
-  return retv;
-}
-
-std::shared_ptr<TrainerConfigHelper> TrainerConfigHelper::createFromFlags() {
-  std::string configPath;
-  if (!FLAGS_config.empty()) {
-    configPath = FLAGS_config;
-  } else if (!FLAGS_init_model_path.empty()) {
-    configPath = getConfigNameFromPath(FLAGS_init_model_path);
-  } else if (FLAGS_start_pass >= 1) {
-    configPath =
-        getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path);
-  } else {
-    return nullptr;
-  }
-  return std::make_shared<TrainerConfigHelper>(configPath);
-}
-
-std::shared_ptr<TrainerConfigHelper>
-TrainerConfigHelper::createFromFlagConfig() {
-  CHECK(!FLAGS_config.empty());
-  return std::make_shared<TrainerConfigHelper>(FLAGS_config);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.h b/paddle/legacy/trainer/TrainerConfigHelper.h
deleted file mode 100644
index 0e428bea2c4..00000000000
--- a/paddle/legacy/trainer/TrainerConfigHelper.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/legacy/utils/Logging.h>
-#include <paddle/legacy/utils/Util.h>
-#include <memory>
-
-namespace paddle {
-
-class TrainerConfig;
-class OptimizationConfig;
-struct TrainerConfigHelperPrivate;
-class ModelConfig;
-class DataConfig;
-
-/**
- * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object,
- * simplize the usage for TrainerConfig.
- *
- * The all operation to TrainerConfig object should use this object. It remove
- * many copy & paste code in trainer.
- *
- * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not.
- * Define a macro to unify 'final' keyword
- */
-class TrainerConfigHelper /*final*/ {
- public:
-  DISABLE_COPY(TrainerConfigHelper);
-
-  /**
-   * @brief Ctor, Create a TrainerConfig from config file
-   * @param configFilePath Config file path.
-   */
-  explicit TrainerConfigHelper(const std::string& configFilePath);
-  explicit TrainerConfigHelper(const TrainerConfig& config);
-
-  /**
-   * Dtor
-   * @warning this class is a final class. Should not be inherited.
-   */
-  ~TrainerConfigHelper();
-
-  /**
-   * @brief Get Trainer Config itself.
-   */
-  const TrainerConfig& getConfig() const;
-
-  TrainerConfig& getMutableConfig();
-
-  /**
-   * @brief Get Optimizer Config.
-   */
-  const OptimizationConfig& getOptConfig() const;
-
-  /**
-   * @brief Get Model Config.
-   */
-  const ModelConfig& getModelConfig() const;
-
-  /**
-   * @brief Get Train Data Config Pointer.
-   * @return nullptr if there is no train data. Else will return pointer
-   */
-  const DataConfig* getDataConfigPtr() const;
-
-  /**
-   * @brief Get Tain Data Config.
-   * @warning Core when there is no train data.
-   */
-  const DataConfig& getDataConfig() const {
-    CHECK(this->hasDataConfig());
-    auto conf = this->getDataConfigPtr();
-    return *conf;
-  }
-
-  /**
-   * @brief Get test data config
-   * @warning Core when there is no test data.
-   */
-  const DataConfig& getTestDataConfig() const;
-
-  /**
-   * @brief Has train data config or not.
-   * @return true if has train data.
-   */
-  bool hasDataConfig() const;
-
-  /**
-   * @brief Has test data config or not.
-   * @return true if has test data.
-   */
-  bool hasTestDataConfig() const;
-
-  /**
-   * @brief Update trainer config from command line flags.
-   *        Override config's (save_dir, init_model_path, start_pass) if command
-   *        flags is existed.
-   */
-  void updateConfigFromFlags();
-
-  /**
-   * @brief Disable optimization's sparse remote update.
-   */
-  void disableRemoteSparseUpdater();
-
-  /**
-   * @brief Disable optimization and each parameter's sparse remote update.
-   */
-  void disableRemoteSparseUpdaterForEachParams();
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const TrainerConfig&() const { return this->getConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const OptimizationConfig&() const {
-    return this->getOptConfig();
-  }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const DataConfig&() const { return this->getDataConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const ModelConfig&() const { return this->getModelConfig(); }
-
-  /**
-   * @brief Get mutable optimization config.
-   */
-  OptimizationConfig& getOptConfig();
-
-  /**
-   * @brief set model save directory.
-   * @param saveDir Directory path.
-   */
-  void setSaveDir(const std::string& saveDir);
-
-  /**
-   * @brief get model save directory.
-   * @return save directory path.
-   */
-  const std::string& getSaveDir() const;
-
-  /**
-   * @brief Get config file name from model path.
-   *
-   * Paddle save model to a directory, and write a file 'path.txt' which save
-   * config filename.
-   *
-   * @param modelPath model saved directory.
-   * @return config file name.
-   */
-  static std::string getConfigNameFromPath(const std::string& modelPath);
-
-  /**
-   * @brief Get config file name from this config instance.
-   * @param[out] ok true if no error.
-   * @return config file name.
-   */
-  std::string getConfigName(bool* ok = nullptr) const;
-
-  /**
-   * @brief Try to create TrainerConfigHelper from all command line flags.
-   *        Try to load from --config, --init_model_path, --start_pass one by
-   *        one. Return nullptr if cannot load TrainerConfigHelper from all
-   *        these place.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlags();
-
-  /**
-   * @brief Try to create TrainerConfigHelper only from '--config' flag.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
-
- private:
-  static std::string getConfigNameFromPassId(int passId,
-                                             const std::string& modelPath);
-
-  TrainerConfigHelperPrivate* m;
-};
-
-typedef std::shared_ptr<TrainerConfigHelper> TrainerConfigHelperPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.cpp b/paddle/legacy/trainer/TrainerInternal.cpp
deleted file mode 100644
index ee3dea63401..00000000000
--- a/paddle/legacy/trainer/TrainerInternal.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerInternal.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "ThreadParameterUpdater.h"
-
-namespace paddle {
-
-void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                           const GradientMachinePtr& gradientMachine,
-                           std::unique_ptr<TrainerInternalConfig>&& intconfig,
-                           const std::shared_ptr<TrainerStats>& stats,
-                           bool testing) {
-  config_ = config;
-  intconfig_ = std::move(intconfig);
-  stats_ = stats;
-
-  //! in training will use parameter updater definitly.
-  //! But only use parameter in testing mode when some parameter in pserver.
-  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
-                   intconfig_->loadsave_parameters_in_pserver)) {
-    createParameterUpdater(testing);
-  }
-
-  gradientMachine_ = gradientMachine;
-  if (!gradientMachine) {
-    CHECK(config_->getConfig().has_model_config())
-        << "Missing model_config in trainer_config";
-    gradientMachine_.reset(
-        GradientMachine::create(config_->getConfig().model_config(),
-                                intconfig_->mode,
-                                parameterUpdater_->getParameterTypes()));
-  }
-}
-
-void TrainerInternal::trainOneBatch(int64_t batchId,
-                                    const DataBatch& dataBatch,
-                                    std::vector<Argument>* outArgs) {
-  // true means updating parameter whenever gradient is ready during backward()
-  bool doPipelineUpdate =
-      (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
-      (intconfig_->local || intconfig_->use_gpu ||
-       intconfig_->trainer_count <= 1);
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return;
-  }
-
-  bool showStats = intconfig_->show_param_stats_period > 0 &&
-                   (batchId + 1) % intconfig_->show_param_stats_period == 0 &&
-                   intconfig_->trainer_id == 0;
-
-  std::vector<ParaStat> paraStats;
-  if (showStats) {
-    paraStats.resize(gradientMachine_->getParameters().size());
-  }
-
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-
-  PassType passType = parameterUpdater_->startBatch(actualBatchSize);
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote();
-  }
-
-  UpdateCallback updateCallback = [this, showStats, &paraStats](
-      Parameter* para) {
-    if (showStats) {
-      //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor
-      // it
-      //! to ParameterHook.
-      auto& grad = para->getBuf(PARAMETER_GRADIENT);
-      SetDevice device(para->getDeviceId());
-      paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize();
-      paraStats[para->getID()].maxAbsGrad = grad->getAbsMax();
-    }
-    parameterUpdater_->update(para);
-  };
-
-  {
-#ifndef PADDLE_DISABLE_TIMER
-    Timer timer;
-    timer.start();
-#endif
-    REGISTER_TIMER("forwardBackward");
-    forwardBackwardBatch(
-        inArgs, *outArgs, passType, updateCallback, doPipelineUpdate);
-#ifndef PADDLE_DISABLE_TIMER
-    timer.stop();
-    parameterUpdater_->setForwardbackwardTime(timer.get());
-#endif
-  }
-
-  if (!doPipelineUpdate) {
-    auto& parameters = gradientMachine_->getNonStaticParameters();
-    for (auto& para : parameters) {
-      updateCallback(para.get());
-    }
-  }
-
-  real cost = 0;
-  {
-    REGISTER_TIMER("sumCost");
-    cost = Argument::sum(*outArgs);
-  }
-
-  if (batchId % intconfig_->log_period == 0) {
-    currentEvaluator_->start();
-    stats_->resetCurrentStat();
-  }
-  {
-    REGISTER_TIMER("eval");
-    gradientMachine_->eval(currentEvaluator_);
-    gradientMachine_->eval(evaluator_);
-  }
-
-  *stats_ += {actualBatchSize, cost};
-  {
-    REGISTER_TIMER("finishBatch");
-    parameterUpdater_->finishBatch(cost);
-  }
-
-  if (showStats) {
-    showParameterStats(paraStats);
-  }
-  if ((batchId + 1) % intconfig_->log_period == 0) {
-    currentEvaluator_->finish();
-
-    if (intconfig_->dot_period > 0) {
-      std::cerr << std::endl;
-    }
-    LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_
-              << " Eval: " << *evaluator_
-              << " CurrentEval: " << *currentEvaluator_;
-  } else if (intconfig_->dot_period > 0 &&
-             (batchId + 1) % intconfig_->dot_period == 0) {
-    std::cerr << ".";
-  }
-}
-
-/**
- * finish train pass
- */
-void TrainerInternal::finishTrainPass(int passId, int batchId) {
-  gradientMachine_->onPassEnd();
-  parameterUpdater_->finishPass();
-  evaluator_->finish();
-  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " "
-            << stats_->getStats(false /*without current cost*/)
-            << " Eval: " << *evaluator_;
-}
-
-void TrainerInternal::showParameterStats(
-    const std::vector<ParaStat>& paraStats) {
-  std::vector<ParameterPtr>& parameters = gradientMachine_->getParameters();
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum();
-    const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE);
-    std::ostringstream osLrHistogram;
-    if (lr) {
-      if (VLOG_IS_ON(2)) {
-        osLrHistogram << " lr_histogram: ";
-        lr->histogram(osLrHistogram);
-      } else {
-        osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax()
-                      << " min_lr=" << std::setw(11) << lr->getMin()
-                      << " avg_lr=" << std::setw(11)
-                      << lr->getSum() / parameter->getSize();
-      }
-    }
-    int pid = parameter->getID();
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << std::setw(20) << parameter->getName()
-              << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize()
-              << " max_val=" << std::setw(11)
-              << parameter->getBuf(PARAMETER_VALUE)->getAbsMax()
-              << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad
-              << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad
-              << osLrHistogram.str();
-  }
-}
-
-void TrainerInternal::createParameterUpdater(bool testing) {
-  const std::string& alg = config_->getOptConfig().algorithm();
-  parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater(
-      alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes));
-  if (parameterUpdater_) {
-    return;
-  }
-
-  if (!intconfig_->local) {
-    if (testing && config_->getOptConfig().use_sparse_remote_updater()) {
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      localUpdater.reset(
-          new SgdLocalUpdater(config_->getOptConfig()));  // do nothing
-      parameterUpdater_.reset(
-          new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(),
-                                                    intconfig_->num_passes,
-                                                    testing,
-                                                    std::move(localUpdater)));
-    } else {
-      if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode &&
-          !intconfig_->use_old_updater) {
-        intconfig_->use_old_updater = true;
-        LOG(INFO) << "Sgd sparse training can not work with"
-                  << " ConcurrentRemoteParameterUpdater,"
-                  << " automatically reset --use_old_updater=true";
-      }
-
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      if (config_->getOptConfig().num_batches_per_send_parameter() > 1) {
-        CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD)
-            << "Unsupported algorithm in remote-local mode: " << alg;
-        if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-          localUpdater.reset(new SgdThreadUpdater(*config_));
-        } else {
-          localUpdater.reset(new SgdLocalUpdater(*config_));
-        }
-      }
-
-      localUpdater.reset(
-          intconfig_->use_old_updater
-              ? new RemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater))
-              : new ConcurrentRemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater)));
-
-      if (config_->getOptConfig().use_sparse_remote_updater()) {
-        localUpdater.reset(
-            new SparseRemoteParameterUpdaterComposite(*config_,
-                                                      intconfig_->num_passes,
-                                                      testing,
-                                                      std::move(localUpdater)));
-      }
-
-      this->parameterUpdater_ = std::move(localUpdater);
-    }
-  } else {
-    CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1)
-        << "num_batches_per_send_parameter should be one in local mode!";
-
-    if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-      parameterUpdater_.reset(new SgdThreadUpdater(*config_));
-    } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) {
-      if (config_->getModelConfig().type() == "recursive_nn") {
-        parameterUpdater_.reset(new SgdCpuUpdater(*config_));
-      } else if (intconfig_->use_gpu &&
-                 config_->getOptConfig().do_average_in_cpu() &&
-                 config_->getOptConfig().average_window() > 0) {
-        parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_));
-      } else {
-        parameterUpdater_.reset(new SgdLocalUpdater(*config_));
-      }
-    } else {
-      LOG(FATAL) << "Unsupported algorithm in local mode: " << alg;
-    }
-  }
-}
-
-void TrainerInternal::forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>& outArgs,
-                                           PassType& passType,
-                                           UpdateCallback updateCallback,
-                                           bool doPipelineUpdate) {
-  gradientMachine_->forwardBackward(
-      inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.h b/paddle/legacy/trainer/TrainerInternal.h
deleted file mode 100644
index 93919a68fca..00000000000
--- a/paddle/legacy/trainer/TrainerInternal.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fstream>
-
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternalConfig.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-namespace paddle {
-
-/**
- * TrainerInteral
- * the core training class for driving training logic
- */
-class TrainerInternal {
- public:
-  struct ParaStat {
-    real maxAbsGrad;
-    real avgAbsGrad;
-    ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {}
-  };
-
-  TrainerInternal() {}
-
-  /**
-   * Intializes trainer internal class
-   * @param config network config
-   * @param machine gradient machine
-   * @param intconfig training config
-   * @param stats training stats
-   * @param testing if it is in testing phase
-   */
-  void init(const std::shared_ptr<TrainerConfigHelper>& config,
-            const GradientMachinePtr& machine,
-            std::unique_ptr<TrainerInternalConfig>&& intconfig,
-            const std::shared_ptr<TrainerStats>& stats,
-            bool testing);
-
-  virtual ~TrainerInternal() {}
-
-  /**
-   * CreateParameterUpdater
-   * @param testing if it is in testing phase
-   */
-  void createParameterUpdater(bool testing);
-
-  /**
-   * FinishTrainPass
-   * @param passId current pass id
-   * @param batchId current batch id, starts from 0
-   */
-  void finishTrainPass(int passId, int batchId);
-
-  /**
-   * trainOneBatch
-   * @param batchId current batch id
-   * @param dataBatch data for the batch
-   */
-  void trainOneBatch(int64_t batchId,
-                     const DataBatch& dataBatch,
-                     std::vector<Argument>* outArgs);
-
-  /**
-   * showParameterStats
-   * @param paraStats training stats
-   */
-  void showParameterStats(const std::vector<ParaStat>& paraStats);
-
-  /**
-   * getGradientMachine
-   */
-  inline const GradientMachinePtr& getGradientMachine() const {
-    return gradientMachine_;
-  }
-
-  /**
-   * getParameterUpdater
-   */
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdater() {
-    return parameterUpdater_;
-  }
-
-  /**
-   * setCurrentEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; }
-
-  /**
-   * setEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; }
-
-  /**
-   * forwardBackwardBatch
-   * @param inArgs input argument for data batch
-   * @param outArgs output argument from neural network
-   * @param updateCallback layerwise parameter gradient statistics
-   * @param doPipelineUpdate whether to do pipeline update
-   */
-  virtual void forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>& outArgs,
-                                    PassType& passType,
-                                    UpdateCallback updateCallback,
-                                    bool doPipelineUpdate);
-
- protected:
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TrainerInternalConfig> intconfig_;
-  std::shared_ptr<TrainerStats> stats_;
-  Evaluator* currentEvaluator_;
-  Evaluator* evaluator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternalConfig.cpp b/paddle/legacy/trainer/TrainerInternalConfig.cpp
deleted file mode 100644
index 039fcdb5245..00000000000
--- a/paddle/legacy/trainer/TrainerInternalConfig.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerInternalConfig.h"
-
-DEFINE_int32(show_parameter_stats_period,
-             0,
-             "Whether to show parameter stats during training");
-
-DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
-
-DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
-
-DECLARE_int32(num_passes);
-
-DECLARE_bool(local);
-
-namespace paddle {
-
-std::unique_ptr<TrainerInternalConfig> TrainerInternalConfig::createFromMode(
-    GradientMachine::CreateMode mode) {
-  auto config = new TrainerInternalConfig();
-  config->mode = mode;
-  config->local = FLAGS_local;
-  config->use_gpu = FLAGS_use_gpu;
-  config->trainer_count = FLAGS_trainer_count;
-  config->show_param_stats_period = FLAGS_show_parameter_stats_period;
-  config->trainer_id = FLAGS_trainer_id;
-  config->log_period = FLAGS_log_period;
-  config->dot_period = FLAGS_dot_period;
-  config->num_passes = FLAGS_num_passes;
-  config->use_old_updater = FLAGS_use_old_updater;
-  config->loadsave_parameters_in_pserver = FLAGS_loadsave_parameters_in_pserver;
-
-  return std::unique_ptr<TrainerInternalConfig>(config);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternalConfig.h b/paddle/legacy/trainer/TrainerInternalConfig.h
deleted file mode 100644
index b91b5393238..00000000000
--- a/paddle/legacy/trainer/TrainerInternalConfig.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include <sstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-/**
- * @brief TrainerStats object will statistics sample processed and total cost.
- *
- * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost'
- * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost
- * through one mini-batch.
- */
-class TrainerStats {
- public:
-  /**
-   * @brief reset all stats.
-   *
-   * often used before pass start.
-   */
-  inline void reset() {
-    numProcessed_ = 0;
-    totalCost_ = .0;
-    this->resetCurrentStat();
-  }
-
-  /**
-   * @brief reset current stat.
-   *
-   * 'current' means the most recent --log_period mini-batches
-   */
-  inline void resetCurrentStat() {
-    currentCost_ = .0;
-    currentSamples_ = 0;
-  }
-
-  /**
-   * @brief add cost to stat.
-   * @param numProcessed current mini-batch size
-   * @param cost current mini-batch cost
-   */
-  inline void addCost(int64_t numProcessed, real cost) {
-    this->numProcessed_ += numProcessed;
-    this->totalCost_ += cost;
-    this->currentSamples_ += numProcessed;
-    this->currentCost_ += cost;
-  }
-
-  /**
-   * @brief get average cost through on pass(all processed mini-batches)
-   * @return pass average cost
-   */
-  inline real getAvgCost() const {
-    CHECK_NE(this->numProcessed_, 0);
-    return this->totalCost_ / this->numProcessed_;
-  }
-
-  /**
-   * @brief get current mini-batch's average cost.
-   * @return mini-batch average cost
-   */
-  inline real getCurrentAvgCost() const {
-    CHECK_NE(this->currentSamples_, 0);
-    return this->currentCost_ / this->currentSamples_;
-  }
-
-  /**
-   * @brief get all processed samples' number
-   * @return all processed samples' number
-   */
-  inline int64_t getNumProcessed() const { return this->numProcessed_; }
-
-  /**
-   * @brief same function as addCost. But it is simple to invoke.
-   * For example:
-   *
-   * @code{.cpp}
-   * TrainerStats stat;
-   * cost = neuralNetwork.forward(batchSize);
-   * stat += {batchSize, cost};
-   * @endcode
-   *
-   * @param p a pair of parameter, first is numProcessed, second is cost.
-   * @return *this
-   */
-  inline TrainerStats& operator+=(const std::pair<int64_t, real>& p) {
-    this->addCost(p.first, p.second);
-    return *this;
-  }
-
-  /**
-   * @brief TrainerStats Constructor.
-   *
-   * reset stat when constructed.
-   */
-  inline TrainerStats() { this->reset(); }
-
-  /**
-   * @brief show stats to ostream.
-   *
-   * If there is no need to print current cost, set withCurrentCost to False.
-   *
-   * @param os output stream.
-   * @param withCurrentCost print current cost or not.
-   */
-  void showStats(std::ostream& os, bool withCurrentCost = true) const {
-    os << "samples=" << this->getNumProcessed()
-       << " AvgCost=" << this->getAvgCost();
-    if (withCurrentCost) {
-      os << " CurrentCost=" << this->getCurrentAvgCost();
-    }
-  }
-
-  /**
-   * @brief get stats to std::string
-   * @param withCurrentCost return current cost or not
-   * @return stats string
-   */
-  std::string getStats(bool withCurrentCost = true) const {
-    std::ostringstream os;
-    this->showStats(os, withCurrentCost);
-    return os.str();
-  }
-
- private:
-  int64_t numProcessed_;
-  real totalCost_;
-  real currentCost_;
-  int64_t currentSamples_;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) {
-  stats.showStats(os);
-  return os;
-}
-
-/**
- * TrainerInternalConfig
- * general configs for training
- */
-struct TrainerInternalConfig {
-  /**
-   * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and
-   * command line arguments.
-   * @param mode
-   * @return
-   */
-  static std::unique_ptr<TrainerInternalConfig> createFromMode(
-      GradientMachine::CreateMode mode);
-
-  /**
-   * indicate whether the training is local
-   * if local, no parameter server is used
-   */
-  bool local;
-
-  /**
-   * indicate whether training uses GPU
-   */
-  bool use_gpu;
-
-  /**
-   * indicate number of trainer
-   */
-  int trainer_count;
-
-  /**
-   * how frequently to show param stats
-   */
-  int show_param_stats_period;
-
-  /**
-   * current trainer id
-   */
-  int trainer_id;
-
-  /**
-   * frequency to dump log
-   */
-  int log_period;
-
-  /**
-   * dot period
-   */
-  int dot_period;
-
-  /**
-   * num passes for training
-   */
-  int num_passes;
-
-  /**
-   * use old updater
-   */
-  bool use_old_updater;
-
-  /**
-   * whether to load and save parameter in pserver
-   */
-  bool loadsave_parameters_in_pserver;
-
-  /**
-   * training mode
-   */
-  GradientMachine::CreateMode mode;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TrainerMain.cpp b/paddle/legacy/trainer/TrainerMain.cpp
deleted file mode 100644
index 911aeba1928..00000000000
--- a/paddle/legacy/trainer/TrainerMain.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include "paddle/legacy/pserver/ParameterServerController.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-
-DEFINE_bool(start_pserver, false, "Whether to start pserver");
-DECLARE_int32(gpu_id);
-DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-DECLARE_int32(start_pass);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(rdma_tcp);
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  // write logs instantly (never buffer log messages)
-  FLAGS_logbuflevel = -1;
-
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
-  if (FLAGS_start_pserver) {
-    parameterServerPtr.reset(
-        paddle::ParameterServerController::createFromGflags());
-    parameterServerPtr->start();
-  }
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlags();
-  CHECK(config != nullptr) << "no valid config";
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  trainer.init(config, FLAGS_job == "test");
-
-  if (FLAGS_job == "train") {
-    trainer.train();
-  } else if (FLAGS_job == "checkgrad") {
-    trainer.checkGradient();
-  } else if (FLAGS_job == "test") {
-    trainer.test();
-  } else if (FLAGS_job == "time") {
-    trainer.time();
-  } else {
-    LOG(FATAL) << "Unknown job type: " << FLAGS_job;
-  }
-
-  return 0;
-}
diff --git a/paddle/legacy/trainer/tests/.gitignore b/paddle/legacy/trainer/tests/.gitignore
deleted file mode 100644
index aedb0ef22e0..00000000000
--- a/paddle/legacy/trainer/tests/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-dump_text.test
-test_pydata_provider_wrapper.json
-*proto.bin
diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt
deleted file mode 100644
index fbefcced564..00000000000
--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
-
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/trainer/tests)
-function(trainer_test TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endfunction()
-
-trainer_test(test_Compare)
-trainer_test(test_PyDataProviderWrapper)
-trainer_test(test_recurrent_machine_generation)
-if(NOT APPLE)
-  trainer_test(test_Trainer)
-else()
-  message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") 
-endif()
-
-############### test_TrainerOnePass ##########################
-if(WITH_PYTHON)
-  # only run test_TrainerOnePass when PYTHON is enabled, because train one pass
-  # is using PyDataProvider2.
-  add_unittest_without_exec(test_TrainerOnePass
-      test_TrainerOnePass.cpp)
-  add_test(NAME test_TrainerOnePass
-    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
-          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endif()
-
-#################### test_config_parser #########################
-add_test(NAME test_config_parser
-  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
-        ${PADDLE_SOURCE_DIR}/paddle/legacy/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
diff --git a/paddle/legacy/trainer/tests/__init__.py b/paddle/legacy/trainer/tests/__init__.py
deleted file mode 100644
index f662d682632..00000000000
--- a/paddle/legacy/trainer/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/trainer/tests/config_parser_test.py b/paddle/legacy/trainer/tests/config_parser_test.py
deleted file mode 100644
index 0d3d82cbdaf..00000000000
--- a/paddle/legacy/trainer/tests/config_parser_test.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config_and_serialize
-
-if __name__ == '__main__':
-    parse_config_and_serialize('legacy/trainer/tests/test_config.conf', '')
-    parse_config_and_serialize(
-        'legacy/trainer/tests/sample_trainer_config.conf',
-        'extension_module_name=paddle.trainer.config_parser_extension')
-    parse_config_and_serialize(
-        'legacy/gserver/tests/pyDataProvider/trainer.conf', '')
diff --git a/paddle/legacy/trainer/tests/fake_file_list.list b/paddle/legacy/trainer/tests/fake_file_list.list
deleted file mode 100644
index f27ceed277f..00000000000
--- a/paddle/legacy/trainer/tests/fake_file_list.list
+++ /dev/null
@@ -1 +0,0 @@
-do_not_matter.txt
diff --git a/paddle/legacy/trainer/tests/picojson.h b/paddle/legacy/trainer/tests/picojson.h
deleted file mode 100644
index 75349537b1c..00000000000
--- a/paddle/legacy/trainer/tests/picojson.h
+++ /dev/null
@@ -1,1103 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * Copyright 2009-2010 Cybozu Labs, Inc.
- * Copyright 2011-2014 Kazuho Oku
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef picojson_h
-#define picojson_h
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-// for isnan/isinf
-#if __cplusplus >= 201103L
-#include <cmath>
-#else
-extern "C" {
-#ifdef _MSC_VER
-#include <float.h>
-#elif defined(__INTEL_COMPILER)
-#include <mathimf.h>
-#else
-#include <math.h>
-#endif
-}
-#endif
-
-// experimental support for int64_t (see README.mkdn for detail)
-#ifdef PICOJSON_USE_INT64
-#define __STDC_FORMAT_MACROS
-#include <errno.h>
-#include <inttypes.h>
-#endif
-
-// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0
-#ifndef PICOJSON_USE_LOCALE
-#define PICOJSON_USE_LOCALE 1
-#endif
-#if PICOJSON_USE_LOCALE
-extern "C" {
-#include <locale.h>
-}
-#endif
-
-#ifndef PICOJSON_ASSERT
-#define PICOJSON_ASSERT(e)                  \
-  do {                                      \
-    if (!(e)) throw std::runtime_error(#e); \
-  } while (0)
-#endif
-
-#ifdef _MSC_VER
-#define SNPRINTF _snprintf_s
-#pragma warning(push)
-#pragma warning(disable : 4244)  // conversion from int to char
-#pragma warning(disable : 4127)  // conditional expression is constant
-#pragma warning(disable : 4702)  // unreachable code
-#else
-#define SNPRINTF snprintf
-#endif
-
-namespace picojson {
-
-enum {
-  null_type,
-  boolean_type,
-  number_type,
-  string_type,
-  array_type,
-  object_type
-#ifdef PICOJSON_USE_INT64
-  ,
-  int64_type
-#endif
-};
-
-enum { INDENT_WIDTH = 2 };
-
-struct null {};
-
-class value {
- public:
-  typedef std::vector<value> array;
-  typedef std::map<std::string, value> object;
-  union _storage {
-    bool boolean_;
-    double number_;
-#ifdef PICOJSON_USE_INT64
-    int64_t int64_;
-#endif
-    std::string* string_;
-    array* array_;
-    object* object_;
-  };
-
- protected:
-  int type_;
-  _storage u_;
-
- public:
-  value();
-  value(int type, bool);
-  explicit value(bool b);
-#ifdef PICOJSON_USE_INT64
-  explicit value(int64_t i);
-#endif
-  explicit value(double n);
-  explicit value(const std::string& s);
-  explicit value(const array& a);
-  explicit value(const object& o);
-  explicit value(const char* s);
-  value(const char* s, size_t len);
-  ~value();
-  value(const value& x);
-  value& operator=(const value& x);
-  void swap(value& x);
-  template <typename T>
-  bool is() const;
-  template <typename T>
-  const T& get() const;
-  template <typename T>
-  T& get();
-  bool evaluate_as_boolean() const;
-  const value& get(size_t idx) const;
-  const value& get(const std::string& key) const;
-  value& get(size_t idx);
-  value& get(const std::string& key);
-
-  bool contains(size_t idx) const;
-  bool contains(const std::string& key) const;
-  std::string to_str() const;
-  template <typename Iter>
-  void serialize(Iter os, bool prettify = false) const;
-  std::string serialize(bool prettify = false) const;
-
- private:
-  template <typename T>
-  value(const T*);  // intentionally defined to block implicit conversion of
-                    // pointer to bool
-  template <typename Iter>
-  static void _indent(Iter os, int indent);
-  template <typename Iter>
-  void _serialize(Iter os, int indent) const;
-  std::string _serialize(int indent) const;
-};
-
-typedef value::array array;
-typedef value::object object;
-
-inline value::value() : type_(null_type) {}
-
-inline value::value(int type, bool) : type_(type) {
-  switch (type) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(boolean_, false);
-    INIT(number_, 0.0);
-#ifdef PICOJSON_USE_INT64
-    INIT(int64_, 0);
-#endif
-    INIT(string_, new std::string());
-    INIT(array_, new array());
-    INIT(object_, new object());
-#undef INIT
-    default:
-      break;
-  }
-}
-
-inline value::value(bool b) : type_(boolean_type) { u_.boolean_ = b; }
-
-#ifdef PICOJSON_USE_INT64
-inline value::value(int64_t i) : type_(int64_type) { u_.int64_ = i; }
-#endif
-
-inline value::value(double n) : type_(number_type) {
-  if (
-#ifdef _MSC_VER
-      !_finite(n)
-#elif __cplusplus >= 201103L || !(defined(isnan) && defined(isinf))
-      std::isnan(n) || std::isinf(n)
-#else
-      isnan(n) || isinf(n)
-#endif
-          ) {
-    throw std::overflow_error("");
-  }
-  u_.number_ = n;
-}
-
-inline value::value(const std::string& s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const array& a) : type_(array_type) {
-  u_.array_ = new array(a);
-}
-
-inline value::value(const object& o) : type_(object_type) {
-  u_.object_ = new object(o);
-}
-
-inline value::value(const char* s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const char* s, size_t len) : type_(string_type) {
-  u_.string_ = new std::string(s, len);
-}
-
-inline value::~value() {
-  switch (type_) {
-#define DEINIT(p) \
-  case p##type:   \
-    delete u_.p;  \
-    break
-    DEINIT(string_);
-    DEINIT(array_);
-    DEINIT(object_);
-#undef DEINIT
-    default:
-      break;
-  }
-}
-
-inline value::value(const value& x) : type_(x.type_) {
-  switch (type_) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(string_, new std::string(*x.u_.string_));
-    INIT(array_, new array(*x.u_.array_));
-    INIT(object_, new object(*x.u_.object_));
-#undef INIT
-    default:
-      u_ = x.u_;
-      break;
-  }
-}
-
-inline value& value::operator=(const value& x) {
-  if (this != &x) {
-    value t(x);
-    swap(t);
-  }
-  return *this;
-}
-
-inline void value::swap(value& x) {
-  std::swap(type_, x.type_);
-  std::swap(u_, x.u_);
-}
-
-#define IS(ctype, jtype)                 \
-  template <>                            \
-  inline bool value::is<ctype>() const { \
-    return type_ == jtype##_type;        \
-  }
-IS(null, null)
-IS(bool, boolean)
-#ifdef PICOJSON_USE_INT64
-IS(int64_t, int64)
-#endif
-IS(std::string, string)
-IS(array, array)
-IS(object, object)
-#undef IS
-template <>
-inline bool value::is<double>() const {
-  return type_ == number_type
-#ifdef PICOJSON_USE_INT64
-         || type_ == int64_type
-#endif
-      ;
-}
-
-#define GET(ctype, var)                                                    \
-  template <>                                                              \
-  inline const ctype& value::get<ctype>() const {                          \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }                                                                        \
-  template <>                                                              \
-  inline ctype& value::get<ctype>() {                                      \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }
-GET(bool, u_.boolean_)
-GET(std::string, *u_.string_)
-GET(array, *u_.array_)
-GET(object, *u_.object_)
-#ifdef PICOJSON_USE_INT64
-GET(double,
-    (type_ == int64_type && (const_cast<value*>(this)->type_ = number_type,
-                             const_cast<value*>(this)->u_.number_ = u_.int64_),
-     u_.number_))
-GET(int64_t, u_.int64_)
-#else
-GET(double, u_.number_)
-#endif
-#undef GET
-
-inline bool value::evaluate_as_boolean() const {
-  switch (type_) {
-    case null_type:
-      return false;
-    case boolean_type:
-      return u_.boolean_;
-    case number_type:
-      return u_.number_ != 0;
-#ifdef PICOJSON_USE_INT64
-    case int64_type:
-      return u_.int64_ != 0;
-#endif
-    case string_type:
-      return !u_.string_->empty();
-    default:
-      return true;
-  }
-}
-
-inline const value& value::get(size_t idx) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline value& value::get(size_t idx) {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline const value& value::get(const std::string& key) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline value& value::get(const std::string& key) {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline bool value::contains(size_t idx) const {
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size();
-}
-
-inline bool value::contains(const std::string& key) const {
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end();
-}
-
-inline std::string value::to_str() const {
-  switch (type_) {
-    case null_type:
-      return "null";
-    case boolean_type:
-      return u_.boolean_ ? "true" : "false";
-#ifdef PICOJSON_USE_INT64
-    case int64_type: {
-      char buf[sizeof("-9223372036854775808")];
-      SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_);
-      return buf;
-    }
-#endif
-    case number_type: {
-      char buf[256];
-      double tmp;
-      SNPRINTF(buf,
-               sizeof(buf),
-               fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0
-                   ? "%.f"
-                   : "%.17g",
-               u_.number_);
-#if PICOJSON_USE_LOCALE
-      char* decimal_point = localeconv()->decimal_point;
-      if (strcmp(decimal_point, ".") != 0) {
-        size_t decimal_point_len = strlen(decimal_point);
-        for (char* p = buf; *p != '\0'; ++p) {
-          if (strncmp(p, decimal_point, decimal_point_len) == 0) {
-            return std::string(buf, p) + "." + (p + decimal_point_len);
-          }
-        }
-      }
-#endif
-      return buf;
-    }
-    case string_type:
-      return *u_.string_;
-    case array_type:
-      return "array";
-    case object_type:
-      return "object";
-    default:
-      PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-      __assume(0);
-#endif
-  }
-  return std::string();
-}
-
-template <typename Iter>
-void copy(const std::string& s, Iter oi) {
-  std::copy(s.begin(), s.end(), oi);
-}
-
-template <typename Iter>
-void serialize_str(const std::string& s, Iter oi) {
-  *oi++ = '"';
-  for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) {
-    switch (*i) {
-#define MAP(val, sym) \
-  case val:           \
-    copy(sym, oi);    \
-    break
-      MAP('"', "\\\"");
-      MAP('\\', "\\\\");
-      MAP('/', "\\/");
-      MAP('\b', "\\b");
-      MAP('\f', "\\f");
-      MAP('\n', "\\n");
-      MAP('\r', "\\r");
-      MAP('\t', "\\t");
-#undef MAP
-      default:
-        if (static_cast<unsigned char>(*i) < 0x20 || *i == 0x7f) {
-          char buf[7];
-          SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff);
-          copy(buf, buf + 6, oi);
-        } else {
-          *oi++ = *i;
-        }
-        break;
-    }
-  }
-  *oi++ = '"';
-}
-
-template <typename Iter>
-void value::serialize(Iter oi, bool prettify) const {
-  return _serialize(oi, prettify ? 0 : -1);
-}
-
-inline std::string value::serialize(bool prettify) const {
-  return _serialize(prettify ? 0 : -1);
-}
-
-template <typename Iter>
-void value::_indent(Iter oi, int indent) {
-  *oi++ = '\n';
-  for (int i = 0; i < indent * INDENT_WIDTH; ++i) {
-    *oi++ = ' ';
-  }
-}
-
-template <typename Iter>
-void value::_serialize(Iter oi, int indent) const {
-  switch (type_) {
-    case string_type:
-      serialize_str(*u_.string_, oi);
-      break;
-    case array_type: {
-      *oi++ = '[';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end();
-           ++i) {
-        if (i != u_.array_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        i->_serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.array_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = ']';
-      break;
-    }
-    case object_type: {
-      *oi++ = '{';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (object::const_iterator i = u_.object_->begin();
-           i != u_.object_->end();
-           ++i) {
-        if (i != u_.object_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        serialize_str(i->first, oi);
-        *oi++ = ':';
-        if (indent != -1) {
-          *oi++ = ' ';
-        }
-        i->second._serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.object_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = '}';
-      break;
-    }
-    default:
-      copy(to_str(), oi);
-      break;
-  }
-  if (indent == 0) {
-    *oi++ = '\n';
-  }
-}
-
-inline std::string value::_serialize(int indent) const {
-  std::string s;
-  _serialize(std::back_inserter(s), indent);
-  return s;
-}
-
-template <typename Iter>
-class input {
- protected:
-  Iter cur_, end_;
-  int last_ch_;
-  bool ungot_;
-  int line_;
-
- public:
-  input(const Iter& first, const Iter& last)
-      : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
-  int getc() {
-    if (ungot_) {
-      ungot_ = false;
-      return last_ch_;
-    }
-    if (cur_ == end_) {
-      last_ch_ = -1;
-      return -1;
-    }
-    if (last_ch_ == '\n') {
-      line_++;
-    }
-    last_ch_ = *cur_ & 0xff;
-    ++cur_;
-    return last_ch_;
-  }
-  void ungetc() {
-    if (last_ch_ != -1) {
-      PICOJSON_ASSERT(!ungot_);
-      ungot_ = true;
-    }
-  }
-  Iter cur() const { return cur_; }
-  int line() const { return line_; }
-  void skip_ws() {
-    while (1) {
-      int ch = getc();
-      if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
-        ungetc();
-        break;
-      }
-    }
-  }
-  bool expect(int expect) {
-    skip_ws();
-    if (getc() != expect) {
-      ungetc();
-      return false;
-    }
-    return true;
-  }
-  bool match(const std::string& pattern) {
-    for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end();
-         ++pi) {
-      if (getc() != *pi) {
-        ungetc();
-        return false;
-      }
-    }
-    return true;
-  }
-};
-
-template <typename Iter>
-inline int _parse_quadhex(input<Iter>& in) {
-  int uni_ch = 0, hex;
-  for (int i = 0; i < 4; i++) {
-    if ((hex = in.getc()) == -1) {
-      return -1;
-    }
-    if ('0' <= hex && hex <= '9') {
-      hex -= '0';
-    } else if ('A' <= hex && hex <= 'F') {
-      hex -= 'A' - 0xa;
-    } else if ('a' <= hex && hex <= 'f') {
-      hex -= 'a' - 0xa;
-    } else {
-      in.ungetc();
-      return -1;
-    }
-    uni_ch = uni_ch * 16 + hex;
-  }
-  return uni_ch;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_codepoint(String& out, input<Iter>& in) {
-  int uni_ch;
-  if ((uni_ch = _parse_quadhex(in)) == -1) {
-    return false;
-  }
-  if (0xd800 <= uni_ch && uni_ch <= 0xdfff) {
-    if (0xdc00 <= uni_ch) {
-      // a second 16-bit of a surrogate pair appeared
-      return false;
-    }
-    // first 16-bit of surrogate pair, get the next one
-    if (in.getc() != '\\' || in.getc() != 'u') {
-      in.ungetc();
-      return false;
-    }
-    int second = _parse_quadhex(in);
-    if (!(0xdc00 <= second && second <= 0xdfff)) {
-      return false;
-    }
-    uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff);
-    uni_ch += 0x10000;
-  }
-  if (uni_ch < 0x80) {
-    out.push_back(uni_ch);
-  } else {
-    if (uni_ch < 0x800) {
-      out.push_back(0xc0 | (uni_ch >> 6));
-    } else {
-      if (uni_ch < 0x10000) {
-        out.push_back(0xe0 | (uni_ch >> 12));
-      } else {
-        out.push_back(0xf0 | (uni_ch >> 18));
-        out.push_back(0x80 | ((uni_ch >> 12) & 0x3f));
-      }
-      out.push_back(0x80 | ((uni_ch >> 6) & 0x3f));
-    }
-    out.push_back(0x80 | (uni_ch & 0x3f));
-  }
-  return true;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_string(String& out, input<Iter>& in) {
-  while (1) {
-    int ch = in.getc();
-    if (ch < ' ') {
-      in.ungetc();
-      return false;
-    } else if (ch == '"') {
-      return true;
-    } else if (ch == '\\') {
-      if ((ch = in.getc()) == -1) {
-        return false;
-      }
-      switch (ch) {
-#define MAP(sym, val)   \
-  case sym:             \
-    out.push_back(val); \
-    break
-        MAP('"', '\"');
-        MAP('\\', '\\');
-        MAP('/', '/');
-        MAP('b', '\b');
-        MAP('f', '\f');
-        MAP('n', '\n');
-        MAP('r', '\r');
-        MAP('t', '\t');
-#undef MAP
-        case 'u':
-          if (!_parse_codepoint(out, in)) {
-            return false;
-          }
-          break;
-        default:
-          return false;
-      }
-    } else {
-      out.push_back(ch);
-    }
-  }
-  return false;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_array(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_array_start()) {
-    return false;
-  }
-  size_t idx = 0;
-  if (in.expect(']')) {
-    return ctx.parse_array_stop(idx);
-  }
-  do {
-    if (!ctx.parse_array_item(in, idx)) {
-      return false;
-    }
-    idx++;
-  } while (in.expect(','));
-  return in.expect(']') && ctx.parse_array_stop(idx);
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_object(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_object_start()) {
-    return false;
-  }
-  if (in.expect('}')) {
-    return true;
-  }
-  do {
-    std::string key;
-    if (!in.expect('"') || !_parse_string(key, in) || !in.expect(':')) {
-      return false;
-    }
-    if (!ctx.parse_object_item(in, key)) {
-      return false;
-    }
-  } while (in.expect(','));
-  return in.expect('}');
-}
-
-template <typename Iter>
-inline std::string _parse_number(input<Iter>& in) {
-  std::string num_str;
-  while (1) {
-    int ch = in.getc();
-    if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' ||
-        ch == 'E') {
-      num_str.push_back(ch);
-    } else if (ch == '.') {
-#if PICOJSON_USE_LOCALE
-      num_str += localeconv()->decimal_point;
-#else
-      num_str.push_back('.');
-#endif
-    } else {
-      in.ungetc();
-      break;
-    }
-  }
-  return num_str;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse(Context& ctx, input<Iter>& in) {
-  in.skip_ws();
-  int ch = in.getc();
-  switch (ch) {
-#define IS(ch, text, op)        \
-  case ch:                      \
-    if (in.match(text) && op) { \
-      return true;              \
-    } else {                    \
-      return false;             \
-    }
-    IS('n', "ull", ctx.set_null());
-    IS('f', "alse", ctx.set_bool(false));
-    IS('t', "rue", ctx.set_bool(true));
-#undef IS
-    case '"':
-      return ctx.parse_string(in);
-    case '[':
-      return _parse_array(ctx, in);
-    case '{':
-      return _parse_object(ctx, in);
-    default:
-      if (('0' <= ch && ch <= '9') || ch == '-') {
-        double f;
-        char* endp;
-        in.ungetc();
-        std::string num_str = _parse_number(in);
-        if (num_str.empty()) {
-          return false;
-        }
-#ifdef PICOJSON_USE_INT64
-        {
-          errno = 0;
-          intmax_t ival = strtoimax(num_str.c_str(), &endp, 10);
-          if (errno == 0 && std::numeric_limits<int64_t>::min() <= ival &&
-              ival <= std::numeric_limits<int64_t>::max() &&
-              endp == num_str.c_str() + num_str.size()) {
-            ctx.set_int64(ival);
-            return true;
-          }
-        }
-#endif
-        f = strtod(num_str.c_str(), &endp);
-        if (endp == num_str.c_str() + num_str.size()) {
-          ctx.set_number(f);
-          return true;
-        }
-        return false;
-      }
-      break;
-  }
-  in.ungetc();
-  return false;
-}
-
-class deny_parse_context {
- public:
-  bool set_null() { return false; }
-  bool set_bool(bool) { return false; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return false; }
-#endif
-  bool set_number(double) { return false; }
-  template <typename Iter>
-  bool parse_string(input<Iter>&) {
-    return false;
-  }
-  bool parse_array_start() { return false; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>&, size_t) {
-    return false;
-  }
-  bool parse_array_stop(size_t) { return false; }
-  bool parse_object_start() { return false; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>&, const std::string&) {
-    return false;
-  }
-};
-
-class default_parse_context {
- protected:
-  value* out_;
-
- public:
-  default_parse_context(value* out) : out_(out) {}
-  bool set_null() {
-    *out_ = value();
-    return true;
-  }
-  bool set_bool(bool b) {
-    *out_ = value(b);
-    return true;
-  }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t i) {
-    *out_ = value(i);
-    return true;
-  }
-#endif
-  bool set_number(double f) {
-    *out_ = value(f);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    *out_ = value(string_type, false);
-    return _parse_string(out_->get<std::string>(), in);
-  }
-  bool parse_array_start() {
-    *out_ = value(array_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    array& a = out_->get<array>();
-    a.push_back(value());
-    default_parse_context ctx(&a.back());
-    return _parse(ctx, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() {
-    *out_ = value(object_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string& key) {
-    object& o = out_->get<object>();
-    default_parse_context ctx(&o[key]);
-    return _parse(ctx, in);
-  }
-
- private:
-  default_parse_context(const default_parse_context&);
-  default_parse_context& operator=(const default_parse_context&);
-};
-
-class null_parse_context {
- public:
-  struct dummy_str {
-    void push_back(int) {}
-  };
-
- public:
-  null_parse_context() {}
-  bool set_null() { return true; }
-  bool set_bool(bool) { return true; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return true; }
-#endif
-  bool set_number(double) { return true; }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    dummy_str s;
-    return _parse_string(s, in);
-  }
-  bool parse_array_start() { return true; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    return _parse(*this, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() { return true; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string&) {
-    return _parse(*this, in);
-  }
-
- private:
-  null_parse_context(const null_parse_context&);
-  null_parse_context& operator=(const null_parse_context&);
-};
-
-// obsolete, use the version below
-template <typename Iter>
-inline std::string parse(value& out, Iter& pos, const Iter& last) {
-  std::string err;
-  pos = parse(out, pos, last, &err);
-  return err;
-}
-
-template <typename Context, typename Iter>
-inline Iter _parse(Context& ctx,
-                   const Iter& first,
-                   const Iter& last,
-                   std::string* err) {
-  input<Iter> in(first, last);
-  if (!_parse(ctx, in) && err != NULL) {
-    char buf[64];
-    SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line());
-    *err = buf;
-    while (1) {
-      int ch = in.getc();
-      if (ch == -1 || ch == '\n') {
-        break;
-      } else if (ch >= ' ') {
-        err->push_back(ch);
-      }
-    }
-  }
-  return in.cur();
-}
-
-template <typename Iter>
-inline Iter parse(value& out,
-                  const Iter& first,
-                  const Iter& last,
-                  std::string* err) {
-  default_parse_context ctx(&out);
-  return _parse(ctx, first, last, err);
-}
-
-inline std::string parse(value& out, const std::string& s) {
-  std::string err;
-  parse(out, s.begin(), s.end(), &err);
-  return err;
-}
-
-inline std::string parse(value& out, std::istream& is) {
-  std::string err;
-  parse(out,
-        std::istreambuf_iterator<char>(is.rdbuf()),
-        std::istreambuf_iterator<char>(),
-        &err);
-  return err;
-}
-
-template <typename T>
-struct last_error_t {
-  static std::string s;
-};
-template <typename T>
-std::string last_error_t<T>::s;
-
-inline void set_last_error(const std::string& s) { last_error_t<bool>::s = s; }
-
-inline const std::string& get_last_error() { return last_error_t<bool>::s; }
-
-inline bool operator==(const value& x, const value& y) {
-  if (x.is<null>()) return y.is<null>();
-#define PICOJSON_CMP(type) \
-  if (x.is<type>()) return y.is<type>() && x.get<type>() == y.get<type>()
-  PICOJSON_CMP(bool);
-  PICOJSON_CMP(double);
-  PICOJSON_CMP(std::string);
-  PICOJSON_CMP(array);
-  PICOJSON_CMP(object);
-#undef PICOJSON_CMP
-  PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-  __assume(0);
-#endif
-  return false;
-}
-
-inline bool operator!=(const value& x, const value& y) { return !(x == y); }
-}  // namespace picojson
-
-namespace std {
-template <>
-inline void swap(picojson::value& x, picojson::value& y) {
-  x.swap(y);
-}
-}  // namespace std
-
-inline std::istream& operator>>(std::istream& is, picojson::value& x) {
-  picojson::set_last_error(std::string());
-  std::string err = picojson::parse(x, is);
-  if (!err.empty()) {
-    picojson::set_last_error(err);
-    is.setstate(std::ios::failbit);
-  }
-  return is;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
-  x.serialize(std::ostream_iterator<char>(os));
-  return os;
-}
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#endif
diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
deleted file mode 100644
index ed83e6ae84b..00000000000
--- a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
+++ /dev/null
@@ -1,2 +0,0 @@
-0;0 1 3 5;1 3.42 2.25;2 4:4.2 6:2.8;3 aa
-2;0 7 3 8;1 2.25 1.24;2 1:2.3 5:8.24;3 bb
diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
deleted file mode 100644
index 11c1b1b38b9..00000000000
--- a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
+++ /dev/null
@@ -1 +0,0 @@
-legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
deleted file mode 100644
index 47401c949ef..00000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
+++ /dev/null
@@ -1,60 +0,0 @@
-0
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-1
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-2
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-3
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-4
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-5
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-6
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-7
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-8
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-9
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-10
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-11
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-12
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-13
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-14
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
deleted file mode 100644
index 02c7f142a34..00000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
+++ /dev/null
@@ -1,16 +0,0 @@
-0	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
deleted file mode 100644
index 23bf1179ebb..00000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
+++ /dev/null
@@ -1,16 +0,0 @@
-0	 1 2 3 4
-1	 1 2 3 4
-2	 1 2 3 4
-3	 1 2 3 4
-4	 1 2 3 4
-5	 1 2 3 4
-6	 1 2 3 4
-7	 1 2 3 4
-8	 1 2 3 4
-9	 1 2 3 4
-10	 1 2 3 4
-11	 1 2 3 4
-12	 1 2 3 4
-13	 1 2 3 4
-14	 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
deleted file mode 100644
index 161624fbf795ac6188795a6350ab0887b53e6bba..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
wcmZQzU|?VYVo4wdfwO0P_CZ)D4lytw;|8qat5>bUDh@PbKg1jmiDJ%v0D;yY&;S4c

diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
deleted file mode 100644
index 30ccf33d2e308ae12f1c719986d2a317344cf39b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
ZcmZQzU|?VYVo4x|fChUQ3zepxH~_{A1K9uo

diff --git a/paddle/legacy/trainer/tests/sample_data.txt b/paddle/legacy/trainer/tests/sample_data.txt
deleted file mode 100644
index 3398a38bdfc..00000000000
--- a/paddle/legacy/trainer/tests/sample_data.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-0 1 2 -1
-2 3 -1 2
-1 2 2 1
-0 2 1 2
-1 3 1 2
-1 1 2 1
-0 3 -1 2
-1 -2 2 1
-2 2 1 2
-1 3 1 2
diff --git a/paddle/legacy/trainer/tests/sample_filelist.txt b/paddle/legacy/trainer/tests/sample_filelist.txt
deleted file mode 100644
index 8573f9e1795..00000000000
--- a/paddle/legacy/trainer/tests/sample_filelist.txt
+++ /dev/null
@@ -1 +0,0 @@
-legacy/trainer/tests/sample_data.txt
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config.conf b/paddle/legacy/trainer/tests/sample_trainer_config.conf
deleted file mode 100644
index 5800b362566..00000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config.conf
+++ /dev/null
@@ -1,87 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "legacy/trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "legacy/trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=9,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation(),
-               param_attr=ParamAttr(name='sharew'))
-
-fc5 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=BReluActivation())
-
-fc6 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SoftReluActivation())
-
-fc7 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=SquareActivation())
-
-fc8 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SquareActivation())
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as layer9:
-    layer9 += full_matrix_projection(input=fc1)
-    layer9 += full_matrix_projection(input=fc2)
-    layer9 += full_matrix_projection(input=fc3)
-    layer9 += trans_full_matrix_projection(input=fc4,
-                                           param_attr=ParamAttr(name='sharew'))
-    layer9 += full_matrix_projection(input=fc5)
-    layer9 += full_matrix_projection(input=fc6)
-    layer9 += full_matrix_projection(input=fc7)
-    layer9 += full_matrix_projection(input=fc8)
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1)
-    outputs(classification_cost(input=layer9, label=lbl))
-else:    
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(layer9)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
deleted file mode 100644
index 155c40b31f3..00000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
+++ /dev/null
@@ -1,53 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "legacy/trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=12,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=19,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation())
-
-# This is for training the neural network.
-# We need to have another data layer for label
-# and a layer for calculating cost
-lbl = data_layer(name='label', size=1)
-
-outputs(hsigmoid(input=[fc1, fc2, fc3, fc4],
-                 label=lbl,
-                 num_classes=3))
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
deleted file mode 100644
index 49cdde7fa2c..00000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
+++ /dev/null
@@ -1,86 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "legacy/trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "legacy/trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-# Output layer, label layer, cost layer, preferably set to the same environment.
-output_device = 0
-
-# Input Layer does not need to specify the device number.
-data = data_layer(name='input', size=3)
-
-# Calculate in the CPU.
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=-1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc2 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc3 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc4 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc5 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc4,fc5], size=10,
-                  bias_attr=True,
-                  layer_attr=ExtraAttr(device=output_device),
-                  act=SoftmaxActivation())
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1,
-                    layer_attr=ExtraAttr(device=output_device))
-                    
-    outputs(classification_cost(input=output, 
-                                label=lbl,
-                                layer_attr=ExtraAttr(device=output_device)))
-else:
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(output)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
deleted file mode 100644
index 51ef905a5a1..00000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ /dev/null
@@ -1,73 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-def outer_step(dummy_data):
-
-    gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True),
-                  GeneratedInput(size=num_words,
-                                 embedding_name="wordvec",
-                                 embedding_size=num_words)]
-
-    def inner_step(dummy_memory, predict_word):
-
-        # simplified RNN for testing
-        with mixed_layer(size=num_words) as layer:
-            layer += full_matrix_projection(input=predict_word,
-                                            param_attr=ParamAttr(name="transtable"))
-
-        with mixed_layer(size=num_words, act=ExpActivation()) as out:
-            out += trans_full_matrix_projection(input=layer,
-                                                param_attr=ParamAttr(name="wordvec"))
-
-        return out
-
-    beam_gen = beam_search(name="rnn_gen",
-                           step=inner_step,
-                           input=gen_inputs,
-                           bos_id=0,
-                           eos_id=num_words-1,
-                           beam_size=2 if beam_flag else 1,
-                           num_results_per_sample=1,
-                           max_length=10)
-    return beam_gen
-
-beam_gen_concat = recurrent_group(name="rnn_gen_concat",
-                                  step=outer_step,
-                                  input=[SubsequenceInput(dummy_data)])
-
-seqtext_printer_evaluator(input=beam_gen_concat,
-                          id_input=sent_id,
-                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
-                          result_file="./legacy/trainer/tests/dump_text.test")
-#outputs(beam_gen_concat)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
deleted file mode 100644
index 35c7f0fcd91..00000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
+++ /dev/null
@@ -1,66 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-gen_inputs = [StaticInput(input=dummy_data, size=2),
-              GeneratedInput(size=num_words,
-                             embedding_name="wordvec",
-                             embedding_size=num_words)]
-
-def step(dummy_memory, predict_word):
-
-    # simplified RNN for testing
-    with mixed_layer(size=num_words) as layer:
-        layer += full_matrix_projection(input=predict_word,
-                                        param_attr=ParamAttr(name="transtable"))
-
-    with mixed_layer(size=num_words, act=ExpActivation()) as out:
-        out += trans_full_matrix_projection(input=layer,
-                                            param_attr=ParamAttr(name="wordvec"))
-
-    return out
-
-beam_gen = beam_search(name="rnn_gen",
-                       step=step,
-                       input=gen_inputs,
-                       bos_id=0,
-                       eos_id=num_words-1,
-                       beam_size=2 if beam_flag else 1,
-                       num_results_per_sample=2 if beam_flag else 1,
-                       max_length=10)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=sent_id,
-                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
-                          result_file="./legacy/trainer/tests/dump_text.test")
-#outputs(beam_gen)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
deleted file mode 100644
index 9419f4d903b..00000000000
--- a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
-
-file_list = 'legacy/trainer/tests/fake_file_list.list'
-
-define_py_data_sources2(
-    train_list=file_list,
-    test_list=file_list,
-    module="simple_sparse_neural_network_dp",
-    obj="process")
-
-embedding = embedding_layer(
-    input=data_layer(
-        name="word_ids", size=8191),
-    size=128,
-    param_attr=ParamAttr(sparse_update=True))
-prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=prediction, label=data_layer(
-            name='label', size=10)))
diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
deleted file mode 100644
index 49043c91758..00000000000
--- a/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import provider, integer_sequence, integer_value
-import random
-
-
-def init_hook(settings, is_train, **kwargs):
-    settings.is_train = is_train
-
-
-@provider(
-    input_types={'word_ids': integer_value(8191),
-                 'label': integer_value(10)},
-    min_pool_size=0,
-    init_hook=init_hook)
-def process(settings, filename):
-    if settings.is_train:
-        data_size = 2**10
-    else:
-        data_size = 2**5
-
-    for _ in xrange(data_size):
-        yield random.randint(0, 8190), random.randint(0, 9)
diff --git a/paddle/legacy/trainer/tests/testPyDataWrapper.py b/paddle/legacy/trainer/tests/testPyDataWrapper.py
deleted file mode 100644
index a76eeeacb91..00000000000
--- a/paddle/legacy/trainer/tests/testPyDataWrapper.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append("../")
-
-from paddle.trainer.PyDataProviderWrapper import *
-import random
-import json
-import string
-
-SPARSE_ID_LIMIT = 1000
-SPARSE_ID_COUNT = 100
-SEQUENCE_LIMIT = 50
-STRING_LIMIT = 10
-
-sparse_id_randomer = lambda: random.randrange(0, SPARSE_ID_LIMIT - 1)
-sparse_count_randomer = lambda: random.randrange(1, SPARSE_ID_COUNT)
-val_randomer = lambda: random.uniform(-1.0, 1.0)
-seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT)
-str_count_randomer = lambda: random.randrange(1, STRING_LIMIT)
-
-
-class IDRandomer():  # A random generator, return unique id
-    def __init__(self):
-        self.id_set = set()
-
-    def __call__(self):
-        idx = sparse_id_randomer()
-        if idx not in self.id_set:
-            self.id_set.add(idx)
-            return idx
-        else:
-            return self.__call__()
-
-
-# SparseValueSlot
-def sparse_value_creator(_):
-    rand = IDRandomer()
-    return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())]
-
-
-sparse_value = map(sparse_value_creator, range(seq_count_randomer()))
-
-
-# DenseSlot
-def dense_creator(_):
-    return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)]
-
-
-dense = map(dense_creator, range(seq_count_randomer()))
-
-
-# SparseNonValueSlot
-def sparse_creator(_):
-    rand = IDRandomer()
-    return [rand() for _ in xrange(sparse_count_randomer())]
-
-
-sparse_nonvalue = map(sparse_creator, range(seq_count_randomer()))
-
-# IndexSlot
-ids = [sparse_id_randomer() for _ in range(seq_count_randomer())]
-
-
-# StringSlot
-def random_str(size=8, chars=string.ascii_letters + string.digits):
-    return ''.join(random.choice(chars) for _ in range(size))
-
-
-strs = [random_str(str_count_randomer()) for _ in range(seq_count_randomer())]
-
-
-def processSeqAndGenerateDataInit(obj, *args, **kwargs):
-    obj.json_filename = kwargs.get("load_data_args", "test_data.json")
-
-
-@provider(
-    slots=[
-        SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
-        SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
-        StringSlot(SPARSE_ID_LIMIT)
-    ],
-    use_seq=True,
-    init_hook=processSeqAndGenerateDataInit)
-def processSeqAndGenerateData(obj, name):
-    retv = [sparse_value, dense, sparse_nonvalue, ids, strs]
-    # Write to protoseq.
-    with open(obj.json_filename, "w") as f:
-        json.dump(retv, f)
-    yield retv
-
-
-def processSubSeqAndGenerateDataInit(obj, *args, **kwargs):
-    obj.json_filename = kwargs.get("load_data_args", "test_data.json")
-
-
-@provider(
-    slots=[
-        SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
-        SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
-        StringSlot(SPARSE_ID_LIMIT)
-    ],
-    use_seq=True,
-    init_hook=processSubSeqAndGenerateDataInit)
-def processSubSeqAndGenerateData(obj, name):
-    retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs]
-    retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]]
-    # Write to protoseq.
-    with open(obj.json_filename, "w") as f:
-        json.dump(retv_json, f)
-    yield retv_wrapper
-
-
-if __name__ == "__main__":
-    pvd = processSeqAndGenerateData("_")
-    print pvd.getNextBatch(100)
-    pvd = processSubSeqAndGenerateData("_")
-    print pvd.getNextBatch(1)
diff --git a/paddle/legacy/trainer/tests/test_Compare.cpp b/paddle/legacy/trainer/tests/test_Compare.cpp
deleted file mode 100644
index e37e546be85..00000000000
--- a/paddle/legacy/trainer/tests/test_Compare.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <cstdlib>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_string(config_args);
-
-struct comData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(bool useGpu, comData& Data) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-
-  Data.parameters = trainer.getGradientMachine()->getParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start();
-  for (int i = 0; i < 2; ++i) {
-    trainer.getGradientMachine()->forwardBackward(
-        inArgs, &Data.outArgs, PASS_TRAIN);
-  }
-  trainer.getGradientMachine()->finish();
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu);
-
-TEST(Trainer, create) {
-  int devCount = 0;
-  devCount = hl_get_device_count();
-  FLAGS_config_args = "drop_rate=0";
-
-  comData comDataCpu;
-  calcGradient(false, comDataCpu);
-  LOG(INFO) << "Cpu is completed";
-
-  {
-    LOG(INFO) << "Test GPU";
-    comData comData;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu is completed";
-  }
-
-  {
-    LOG(INFO) << "Test test multi gpu";
-    comData comData;
-    FLAGS_trainer_count = devCount;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu4 is completed";
-  }
-
-  {
-    LOG(INFO) << "Test use_sparse_update=true";
-    comData comData;
-    calcGradient(false, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Cpu4 is completed";
-  }
-}
-
-double checkBuffer(real* A, real* B, size_t len) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double precision = 1e-7;
-#else
-  double precision = 2e-3;
-#endif
-  int nNum = 0;
-  double maxE = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double e = fabs(A[i] - B[i]);
-    maxE = std::max(e, maxE);
-    nNum += e > precision * fabs(A[i]);
-  }
-  EXPECT_EQ(0, nNum);
-  return maxE;
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu) {
-  /*compare outArgs*/
-  vector<Argument> outArgs1 = comDataCpu.outArgs;
-  vector<Argument> outArgs2 = comDataGpu.outArgs;
-  CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs1[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt());
-
-  /*compare parameters*/
-  vector<ParameterPtr>& parameters1 = comDataCpu.parameters;
-  vector<ParameterPtr>& parameters2 = comDataGpu.parameters;
-  for (size_t i = 0; i < parameters1.size(); ++i) {
-    ParameterPtr parameter1, parameter2;
-    parameter1 = parameters1[i];
-    parameter2 = parameters2[i];
-    /*compare parameters value*/
-    CpuVector para1(parameter1->getSize());
-    CpuVector para2(parameter2->getSize());
-    para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE));
-    para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE));
-    checkBuffer(para1.getData(), para2.getData(), para1.getSize());
-
-    /*compare parameters grad*/
-    CpuVector cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT));
-    CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT));
-    double e =
-        checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize());
-    LOG(INFO) << parameter1->getName() << " max error=" << e;
-  }
-}
-
-int main(int argc, char** argv) {
-#ifndef PADDLE_WITH_CUDA
-  exit(0);
-#endif
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  exit(ret);
-}
diff --git a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
deleted file mode 100644
index 847adcfabad..00000000000
--- a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <DataConfig.pb.h>
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/dataproviders/DataProvider.h>
-#include <paddle/legacy/math/Matrix.h>
-#include <paddle/legacy/parameter/Argument.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <fstream>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-#include "picojson.h"
-
-void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
-const std::string kDir = "./legacy/trainer/tests/pydata_provider_wrapper_dir/";
-
-TEST(PyDataProviderWrapper, SequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-TEST(PyDataProviderWrapper, HasSubSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSubSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(1, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions and subSequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-      EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].subSequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-void checkValue(std::vector<paddle::Argument>& arguments,
-                picojson::array& arr) {
-  // CHECK SLOT 0, Sparse Value.
-  paddle::Argument& sparse_values_seq = arguments[0];
-  paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value;
-  EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr);
-  paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_values_seq_rawmatrix.get());
-  EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr);
-  EXPECT_EQ(arr.size(), arguments.size());
-  EXPECT_TRUE(arr[0].is<picojson::array>());
-  size_t row_id = 0;
-  for (picojson::value& sparse_val_seq : arr[0].get<picojson::array>()) {
-    std::unordered_map<int, real> cols;
-    for (picojson::value& kv : sparse_val_seq.get<picojson::array>()) {
-      EXPECT_TRUE(kv.get(0).is<double>());
-      EXPECT_TRUE(kv.get(1).is<double>());
-      int col = (int)(kv.get(0).get<double>());
-      real val = (real)(kv.get(1).get<double>());
-      cols.insert({col, val});
-    }
-    size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id);
-    EXPECT_EQ(cols.size(), colNum);
-    int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id);
-    real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id);
-    for (size_t i = 0; i < colNum; ++i) {
-      int id = rowIds[i];
-      auto it = cols.find(id);
-      EXPECT_NE(cols.end(), it);
-      real expect = it->second;
-      EXPECT_NEAR(expect, *rowBuf, 1e-5);
-      ++rowBuf;
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 1, Dense Value.
-  paddle::Argument& dense_arg = arguments[1];
-  paddle::MatrixPtr& dense_mat = dense_arg.value;
-  EXPECT_NE(nullptr, dense_mat);
-  EXPECT_TRUE(arr[1].is<picojson::array>());
-  row_id = 0;
-  for (picojson::value& dense_seq : arr[1].get<picojson::array>()) {
-    EXPECT_TRUE(dense_seq.is<picojson::array>());
-    picojson::array& row = dense_seq.get<picojson::array>();
-    EXPECT_EQ(row.size(), dense_mat->getWidth());
-    real* rowBuf = dense_mat->getRowBuf(row_id++);
-
-    for (picojson::value& val : row) {
-      EXPECT_TRUE(val.is<double>());
-      real expect = val.get<double>();
-      EXPECT_NEAR(expect, *rowBuf++, 1e-5);
-    }
-  }
-
-  // CHECK SLOT 2, Sparse Non Value.
-  paddle::Argument& sparse_non_val_arg = arguments[2];
-  paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value;
-  EXPECT_NE(nullptr, sparse_non_val_rawm);
-  paddle::CpuSparseMatrix* sparse_non_val_m =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_non_val_rawm.get());
-  EXPECT_NE(nullptr, sparse_non_val_m);
-  row_id = 0;
-  for (picojson::value& row : arr[2].get<picojson::array>()) {
-    EXPECT_TRUE(row.is<picojson::array>());
-    std::unordered_set<int> ids;
-    for (picojson::value& id : row.get<picojson::array>()) {
-      EXPECT_TRUE(id.is<double>());
-      ids.insert((int)(id.get<double>()));
-    }
-    size_t colNum = sparse_non_val_m->getColNum(row_id);
-    EXPECT_EQ(ids.size(), colNum);
-    for (size_t i = 0; i < colNum; ++i) {
-      int col = sparse_non_val_m->getRowCols(row_id)[i];
-      EXPECT_TRUE(ids.find(col) != ids.end());
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 3, Index.
-  paddle::Argument& index_arg = arguments[3];
-  paddle::IVectorPtr indices = index_arg.ids;
-  EXPECT_NE(nullptr, indices);
-  int* idPtr = indices->getData();
-  for (picojson::value& id : arr[3].get<picojson::array>()) {
-    EXPECT_TRUE(id.is<double>());
-    int _id = (int)(id.get<double>());
-    EXPECT_EQ(_id, *idPtr++);
-  }
-
-  // CHECK SLOT 4, String.
-  paddle::Argument& strArg = arguments[4];
-  std::vector<std::string>* strPtr = strArg.strs.get();
-  EXPECT_NE(nullptr, strPtr);
-  size_t vecIndex = 0;
-  for (picojson::value& str : arr[4].get<picojson::array>()) {
-    EXPECT_TRUE(str.is<std::string>());
-    std::string _str = str.get<std::string>();
-    EXPECT_EQ(_str, (*strPtr)[vecIndex++]);
-  }
-}
-
-#else
-int main() { return 0; }
-
-#endif
diff --git a/paddle/legacy/trainer/tests/test_Trainer.cpp b/paddle/legacy/trainer/tests/test_Trainer.cpp
deleted file mode 100644
index 14ad0a26528..00000000000
--- a/paddle/legacy/trainer/tests/test_Trainer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Version.h>
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "legacy/trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile4 =
-    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_bool(allow_only_one_model_on_one_gpu);
-
-void checkGradientTest(const string& configFile,
-                       bool useGpu,
-                       bool parallel,
-                       int trainerCount = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  EXPECT_LE(fabs(trainer.checkGradient()), 0.02);
-}
-
-TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
-
-TEST(checkGradient, multiGpu) {
-  int numGpu;
-  numGpu = hl_get_device_count();
-  for (auto count : {2, 4}) {
-    if (count <= numGpu) {
-      checkGradientTest(configFile1, true, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, parallel) {
-  if (hl_get_device_count() >= 2) {
-    checkGradientTest(configFile4, true, true);
-  }
-}
-
-TEST(checkGradient, multiParallel) {
-  FLAGS_allow_only_one_model_on_one_gpu = false;
-  checkGradientTest(configFile4, true, true, 2);
-  FLAGS_allow_only_one_model_on_one_gpu = true;
-}
-
-#endif
-
-TEST(checkGradient, multi) {
-  int numGpu;
-  if (version::isWithGpu()) {
-    numGpu = hl_get_device_count();
-  } else {
-    numGpu = 0;
-  }
-  for (bool useGpu : {false, true}) {
-    for (auto count : {2, 4}) {
-      if (useGpu && count > numGpu) continue;
-      checkGradientTest(configFile1, useGpu, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
-
-TEST(checkGradient, non_parallel) {
-  checkGradientTest(configFile4, false, false);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
deleted file mode 100644
index 3e5c5ea723f..00000000000
--- a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/GlobalConstants.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/trainer/TrainerInternal.h"
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
-
-static const string& configFileSimpleSparse =
-    "legacy/trainer/tests/simple_sparse_neural_network.py";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-class TrainerForTest : public paddle::Trainer {
- public:
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
-    return this->trainerInternal_.getParameterUpdater();
-  }
-};
-
-int gNumDevices = 0;
-
-void trainerOnePassTest(const string& configFile,
-                        bool useGpu,
-                        bool parallel,
-                        int trainerCount = 1,
-                        double averageWindow = 0.0f,
-                        bool doAverageInCpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  if (averageWindow > 0) {
-    config->getOptConfig().set_average_window(averageWindow);
-    config->getOptConfig().set_do_average_in_cpu(doAverageInCpu);
-  }
-  trainer.init(config);
-  trainer.train();
-}
-
-// 1. test trainer (cpu, gpu).
-TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
-
-TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
-
-TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }
-
-TEST(trainerOnePass, parallel) {
-  if (hl_get_device_count() >= 2) {
-    trainerOnePassTest(configFile2, true, true);
-  }
-}
-#endif
-
-// 2. test average_window.
-#ifdef PADDLE_WITH_CUDA
-TEST(average_window, gpu) {
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-}
-
-TEST(average_window, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-#endif
-
-// 3. test trainer + pserver.
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-
-double checkRemoteParameterUpdater(TrainerForTest& trainer) {
-  auto gradientMachine = trainer.getGradientMachine();
-  auto parameterUpdater = trainer.getParameterUpdaterForTest();
-  auto dataProvider = trainer.getDataProvider();
-  auto& parameters = gradientMachine->getParameters();
-  const TrainerConfig& config = trainer.getConfig();
-  const string& alg = config.opt_config().algorithm();
-
-  vector<ParameterPtr> parameterCheck;
-  for (auto& parameter : parameters) {
-    parameterCheck.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT));
-  }
-
-  std::unique_ptr<ParameterUpdater> parameterUpdaterCheck;
-  if (alg == TrainAlgorithm::SGD) {
-    parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config()));
-  } else {
-    LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg;
-    return -1.0;
-  }
-  parameterUpdaterCheck->init(parameterCheck);
-
-  // gradientMachine->start(config, *dataProvider);
-  DataBatch dataBatch;
-  int32_t batchSize = config.opt_config().batch_size();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  int64_t actualBatchSize = dataBatch.getSize();
-  const vector<Argument>& inArgs = dataBatch.getStreams();
-  vector<Argument> outArgs;
-
-  UpdateCallback updateCallback = [parameterUpdater,
-                                   parameterCheck](Parameter* para) {
-    parameterCheck[para->getID()]
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    parameterUpdater->update(para);
-  };
-
-  parameterUpdater->startPass();
-  parameterUpdaterCheck->startPass();
-
-  for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2;
-       ++i) {
-    PassType passType = parameterUpdater->startBatch(actualBatchSize);
-    gradientMachine->forwardBackward(
-        inArgs, &outArgs, passType, updateCallback);
-    parameterUpdater->finishBatch(0);
-
-    parameterUpdaterCheck->startBatch(actualBatchSize);
-    for (auto& para : parameterCheck) {
-      parameterUpdaterCheck->update(para.get());
-    }
-    parameterUpdaterCheck->finishBatch(0);
-  }
-
-  double sum = 0.0f;
-  for (size_t i = 0; i != parameters.size(); ++i) {
-    real *v1, *v2;
-    CpuVector trainerPara(parameters[i]->getSize());
-    trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    if (!FLAGS_use_gpu) {
-      v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData();
-    } else {
-      v1 = trainerPara.getData();
-    }
-    v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData();
-
-    size_t size = parameters[i]->getSize();
-    double diff = 0;
-    for (size_t j = 0; j < size; ++j) {
-      diff += fabs(v1[j] - v2[j]);
-    }
-    sum += diff;
-    LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20)
-              << parameters[i]->getName() << "diff=" << setw(15) << diff;
-  }
-
-  parameterUpdater->finishPass();
-  parameterUpdaterCheck->finishPass();
-  gradientMachine->finish();
-  return sum;
-}
-
-void checkRemoteParameterUpdaterTest(const string& configFile,
-                                     bool useGpu,
-                                     bool parallel,
-                                     int trainerCount = 1,
-                                     bool useOldUpdater = false,
-                                     int num_batches_per_get_parameter = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_use_old_updater = useOldUpdater;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  FLAGS_local = 0;
-  std::shared_ptr<ParameterServer2> pserver;
-  pserver.reset(new ParameterServer2(std::string(), FLAGS_port));
-  pserver->init();
-  pserver->start();
-
-  TrainerForTest trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  config->getOptConfig().set_num_batches_per_get_parameter(
-      num_batches_per_get_parameter);
-  trainer.init(config);
-  EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0);
-
-  FLAGS_local = 1;
-}
-
-TEST(checkRemoteUpdater, cpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false);
-}
-
-TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkRemoteUpdater, gpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false);
-}
-
-TEST(checkRemoteUpdater, gpu2Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2);
-}
-
-TEST(checkRemoteUpdater, gpu4Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4);
-}
-
-TEST(checkRemoteUpdater, gpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true);
-}
-
-TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true);
-}
-
-TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true);
-}
-
-#endif
-
-TEST(checkRemoteUpdater, cpuDeltaTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10);
-}
-
-TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10);
-}
-
-TEST(SgdThreadUpdater, simpleSparseNN) {
-  trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-  gNumDevices = hl_get_device_count();
-
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameteres
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/trainer/tests/test_config.conf b/paddle/legacy/trainer/tests/test_config.conf
deleted file mode 100644
index bce687ad836..00000000000
--- a/paddle/legacy/trainer/tests/test_config.conf
+++ /dev/null
@@ -1,77 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "legacy/trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-    async_load_data = False))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-wt = data_layer(name='weight', size=1)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=12,
-               bias_attr=True,
-               param_attr=ParamAttr(name='sharew'),
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=True,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(drop_rate=0.5),
-               act=SquareActivation())
-
-pool = img_pool_layer(input=fc2,
-                      pool_size=2,
-                      pool_size_y=3,
-                      num_channels=1,
-                      padding=1,
-                      padding_y=2,
-                      stride=2,
-                      stride_y=3,
-                      pool_type=CudnnAvgPooling())
-
-concat = concat_layer(input=[fc3, fc4])
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as output:
-    output += full_matrix_projection(input=fc1)
-    output += trans_full_matrix_projection(input=fc2,
-                                           param_attr=ParamAttr(name='sharew'))
-    output += full_matrix_projection(input=concat)
-    output += identity_projection(input=fc3)
-
-lbl = data_layer(name='label', size=1)
-
-cost = classification_cost(input=output, label=lbl, weight=wt,
-                           layer_attr=ExtraAttr(device=-1))
-
-nce = nce_layer(input=fc2, label=lbl, weight=wt,
-                num_classes=3, 
-                neg_distribution=[0.1, 0.3, 0.6])
-                
-outputs(cost, nce)
diff --git a/paddle/legacy/trainer/tests/test_gen_dict.txt b/paddle/legacy/trainer/tests/test_gen_dict.txt
deleted file mode 100644
index 1000f900578..00000000000
--- a/paddle/legacy/trainer/tests/test_gen_dict.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-0
-1
-2
-3
-4
-5
-6
-7
-8
diff --git a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
deleted file mode 100644
index 47b4e82cd32..00000000000
--- a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-
-#include <paddle/legacy/trainer/Trainer.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& CONFIG_FILE =
-    "legacy/trainer/tests/sample_trainer_rnn_gen.conf";
-static const string& NEST_CONFIG_FILE =
-    "legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf";
-static const string& OUTPUT_DIR = "legacy/trainer/tests/dump_text.test";
-static string modelDir =
-    "legacy/trainer/tests/rnn_gen_test_model_dir/t1";       // NOLINT
-static string expectFile =                                  // NOLINT
-    "legacy/trainer/tests/rnn_gen_test_model_dir/r1.test";  // NOLINT
-
-DECLARE_string(config_args);
-
-vector<float> readRetFile(const string& fname) {
-  ifstream inFile(fname);
-  float ret;
-  vector<float> nums;
-  while (inFile >> ret) {
-    nums.push_back(ret);
-  }
-  return nums;
-}
-
-void checkOutput(const string& expRetFile) {
-  vector<float> rets = readRetFile(OUTPUT_DIR);
-  vector<float> expRets = readRetFile(expRetFile);
-  EXPECT_EQ(rets.size(), expRets.size());
-  for (size_t i = 0; i < rets.size(); i++) {
-    EXPECT_FLOAT_EQ(rets[i], expRets[i]);
-  }
-}
-
-void prepareInArgs(vector<Argument>& inArgs,
-                   const size_t batchSize,
-                   bool useGpu,
-                   bool hasSubseq) {
-  inArgs.clear();
-  // sentence id
-  Argument sentId;
-  sentId.value = nullptr;
-  if (hasSubseq) {
-    // as there is only one sequence, there is only one label.
-    IVector::resizeOrCreate(sentId.ids, 1, useGpu);
-    sentId.ids->setElement(0, 0);
-  } else {
-    // as there is batchSize word, there is batchSize label.
-    IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
-    for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
-  }
-  inArgs.emplace_back(sentId);
-
-  // a dummy layer to decide batch size
-  Argument dummyInput;
-  dummyInput.value = Matrix::create(batchSize, 2, false, useGpu);
-  dummyInput.value->randomizeUniform();
-  if (hasSubseq) {
-    // generate one sequence with batchSize subsequence,
-    // and each subsequence has only one word.
-    dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false);
-    int* buf = dummyInput.sequenceStartPositions->getMutableData(false);
-    dummyInput.subSequenceStartPositions =
-        ICpuGpuVector::create(batchSize + 1, false);
-    int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false);
-    buf[0] = 0;
-    buf[1] = batchSize;
-    for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i;
-  }
-  inArgs.emplace_back(dummyInput);
-}
-
-void testGeneration(const string& configFile,
-                    bool useGpu,
-                    bool hasSubseq,
-                    const string& expRetFile) {
-  FLAGS_use_gpu = useGpu;
-  auto config = std::make_shared<TrainerConfigHelper>(configFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(modelDir);
-  vector<Argument> inArgs(2);
-
-  const size_t batchSize = 15;
-  prepareInArgs(inArgs, batchSize, useGpu, hasSubseq);
-  vector<Argument> outArgs;
-  unique_ptr<Evaluator> testEvaluator(gradientMachine->makeEvaluator());
-  testEvaluator->start();
-  gradientMachine->forward(inArgs, &outArgs, PASS_TEST);
-  gradientMachine->eval(testEvaluator.get());
-  testEvaluator->finish();
-  checkOutput(expRetFile);
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-TEST(RecurrentGradientMachine, test_generation) {
-#ifndef PADDLE_WITH_CUDA
-  const auto useGpuConfs = {false};
-#else
-  const auto useGpuConfs = {true, false};
-#endif
-  auto testGen = [&](const string& configFile,
-                     bool hasSubseq,
-                     const string& expRetFile,
-                     bool beam_search) {
-    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
-    for (auto useGpu : useGpuConfs) {
-      LOG(INFO) << configFile << " useGpu=" << useGpu
-                << " beam_search=" << beam_search;
-      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
-    }
-  };
-  testGen(CONFIG_FILE, false, expectFile + ".nobeam", false);  // no beam search
-  testGen(CONFIG_FILE, false, expectFile + ".beam", true);     // beam search
-  // In hierarchical RNN, beam search and one way search are only in inner-RNN,
-  // outer-RNN will concat the generated inner-results (first for beam search)
-  // from inner-RNN. Thus, they have the same outer-results.
-  testGen(NEST_CONFIG_FILE,
-          true,
-          expectFile + ".nest",
-          false);  // no beam search
-  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true);  // beam search
-}
-#endif
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  CHECK(argc == 1 || argc == 3);
-  if (argc == 3) {
-    modelDir = argv[1];
-    expectFile = argv[2];
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/utils/.gitignore b/paddle/legacy/utils/.gitignore
deleted file mode 100644
index f2cfd740941..00000000000
--- a/paddle/legacy/utils/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-enable_virtualenv.c
diff --git a/paddle/legacy/utils/Any.h b/paddle/legacy/utils/Any.h
deleted file mode 100644
index 99a0139accc..00000000000
--- a/paddle/legacy/utils/Any.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#if __cplusplus > 201402L
-#include <any>
-
-namespace paddle {
-// using std::any for C++ 17
-using std::any;
-using std::any_cast;
-using std::bad_any_cast;
-}  // namespace paddle
-
-#else
-#include <any.hpp>
-
-namespace paddle {
-// use linb::any for C++ 11
-using linb::any;
-using linb::any_cast;
-using linb::bad_any_cast;
-}  // namespace paddle
-#endif
diff --git a/paddle/legacy/utils/CMakeLists.txt b/paddle/legacy/utils/CMakeLists.txt
deleted file mode 100644
index b42b2bae968..00000000000
--- a/paddle/legacy/utils/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-# The utilities for paddle
-file(GLOB UTIL_HEADERS . *.h)
-file(GLOB UTIL_SOURCES . *.cpp)
-create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
-  ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
-set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
-
-if(APPLE)
-    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
-else()
-    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
-endif()
-add_library(paddle_utils STATIC
-        ${UTIL_SOURCES}
-        ${UTIL_ARCH_SOURCES}
-        ${UTIL_RES})
-add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/utils/ClassRegistrar.h b/paddle/legacy/utils/ClassRegistrar.h
deleted file mode 100644
index 5f40a0b25e9..00000000000
--- a/paddle/legacy/utils/ClassRegistrar.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <string>
-
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * This class is used to keep a set of class types. It can register a
- * class by a type name and create an instance of a class by type.
- * Example:
- *   // Declare the registrar
- *   ClassRegistrar<Layer, LayerConfig> registar_;
- *
- *   // Register a class using its constructor
- *   registrar_.registerClass<ConvLayer>("conv");
- *
- *   // Register a class using a creation function
- *   registrar_.registerClass("pool", [](LayerConfig& config){
- *     return PoolLayer::create(config);
- *   });
- *
- *   // create a class instance by type name
- *   Layer* layer = registrar_.createByType("conv", config);
- */
-template <class BaseClass, typename... CreateArgs>
-class ClassRegistrar {
- public:
-  typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
-
-  // Register a class using a creation function.
-  // The creation function's arguments are CreateArgs
-  void registerClass(const std::string& type, ClassCreator creator) {
-    CHECK(creatorMap_.count(type) == 0) << "Duplicated class type: " << type;
-    creatorMap_[type] = creator;
-  }
-
-  // Register a class using its constructor
-  // The constructor's arguments are CreateArgs
-  template <class ClassType>
-  void registerClass(const std::string& type) {
-    registerClass(type,
-                  [](CreateArgs... args) { return new ClassType(args...); });
-  }
-
-  // Create a class instance of type @type using args
-  BaseClass* createByType(const std::string& type, CreateArgs... args) {
-    ClassCreator creator;
-    CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: "
-                                               << type;
-    return creator(args...);
-  }
-
-  template <typename T>
-  inline void forEachType(T callback) {
-    for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) {
-      callback(it->first);
-    }
-  }
-
- protected:
-  std::map<std::string, ClassCreator> creatorMap_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Common.h b/paddle/legacy/utils/Common.h
deleted file mode 100644
index 1f1d0255a5e..00000000000
--- a/paddle/legacy/utils/Common.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Excepts.h"
-
-/**
- * Disable copy macro.
- */
-#define DISABLE_COPY(class_name)                \
-  class_name(class_name &&) = delete;           \
-  class_name(const class_name &other) = delete; \
-  class_name &operator=(const class_name &other) = delete
-
-namespace paddle {
-
-#ifdef PADDLE_TYPE_DOUBLE
-using real = double;
-#else
-using real = float;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CpuId.cpp b/paddle/legacy/utils/CpuId.cpp
deleted file mode 100644
index 66e7c6606f0..00000000000
--- a/paddle/legacy/utils/CpuId.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/CpuId.h"
-#include "paddle/legacy/utils/Util.h"
-
-#ifdef _WIN32
-
-#include <intrin.h>
-
-/// for MSVC
-#define CPUID(info, x) __cpuidex(info, x, 0)
-
-#else
-
-#if !defined(__arm__) && !defined(__aarch64__)
-#include <cpuid.h>
-/// for GCC/Clang
-#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
-#endif
-
-#endif
-
-namespace paddle {
-
-SIMDFlags::SIMDFlags() {
-#if defined(__arm__) || defined(__aarch64__)
-  simd_flags_ = SIMD_NEON;
-#else
-  unsigned int cpuInfo[4];
-  // CPUID: https://en.wikipedia.org/wiki/CPUID
-  // clang-format off
-  CPUID(cpuInfo, 0x00000001);
-  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
-  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
-
-  CPUID(cpuInfo, 0x00000007);
-  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
-
-  CPUID(cpuInfo, 0x80000001);
-  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
-  // clang-fotmat on
-#endif
-}
-
-SIMDFlags const* SIMDFlags::instance() {
-  static SIMDFlags instance;
-  return &instance;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CpuId.h b/paddle/legacy/utils/CpuId.h
deleted file mode 100644
index ed58211d13a..00000000000
--- a/paddle/legacy/utils/CpuId.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Common.h"
-#include "Error.h"
-
-namespace paddle {
-
-// clang-format off
-enum simd_t {
-  SIMD_NONE   = 0,          ///< None
-  SIMD_SSE    = 1 << 0,     ///< SSE
-  SIMD_SSE2   = 1 << 1,     ///< SSE 2
-  SIMD_SSE3   = 1 << 2,     ///< SSE 3
-  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
-  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
-  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
-  SIMD_FMA3   = 1 << 6,     ///< FMA 3
-  SIMD_FMA4   = 1 << 7,     ///< FMA 4
-  SIMD_AVX    = 1 << 8,     ///< AVX
-  SIMD_AVX2   = 1 << 9,     ///< AVX 2
-  SIMD_AVX512 = 1 << 10,    ///< AVX 512
-  SIMD_NEON   = 1 << 11,    ///  NEON
-};
-// clang-format on
-
-class SIMDFlags final {
- public:
-  DISABLE_COPY(SIMDFlags);
-
-  SIMDFlags();
-
-  static SIMDFlags const* instance();
-
-  inline bool check(int flags) const {
-    return !((simd_flags_ & flags) ^ flags);
-  }
-
- private:
-  int simd_flags_ = SIMD_NONE;
-};
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * For example.
- * @code{.cpp}
- *
- * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
- *      avx2_fm4_stub();
- * } else if (HAS_SIMD(SIMD_AVX)) {
- *      avx_stub();
- * }
- *
- * @endcode
- */
-#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * 1. Check all SIMD flags at runtime:
- *
- * @code{.cpp}
- * if (HAS_AVX && HAS_AVX2) {
- *      avx2_stub();
- * }
- * @endcod
- *
- * 2. Check one SIMD flag at runtime:
- *
- * @code{.cpp}
- * if (HAS_SSE41 || HAS_SSE42) {
- *      sse4_stub();
- * }
- * @endcode
- */
-// clang-format off
-#define HAS_SSE     HAS_SIMD(SIMD_SSE)
-#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
-#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
-#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
-#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
-#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
-#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
-#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
-#define HAS_AVX     HAS_SIMD(SIMD_AVX)
-#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
-#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
-#define HAS_NEON    HAS_SIMD(SIMD_NEON)
-// clang-format on
-
-/**
- * Invoke checkCPUFeature() before Paddle initialization to
- * check target machine whether support compiled instructions.
- * If not, simply throw out an error.
- */
-inline Error __must_check checkCPUFeature() {
-  Error err;
-#ifndef __AVX__
-  if (HAS_AVX) {
-    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
-                 << "but these are available on your machine and could "
-                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
-  }
-#else
-  if (!HAS_AVX) {
-    err = Error(
-        "PaddlePaddle was compiled to use avx instructions, "
-        "but these aren't available on your machine, please "
-        "disable it via CMAKE .. -DWITH_AVX=OFF");
-  }
-#endif  // __AVX__
-#ifdef __SSE3__
-  if (!HAS_SSE3) {
-    err = Error(
-        "PaddlePaddle was compiled to use sse3 instructions, "
-        "which is the minimum requirement of PaddlePaddle. "
-        "But these aren't available on your current machine.");
-  }
-#endif  // __SSE3__
-
-  return err;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CustomStackTrace.cpp b/paddle/legacy/utils/CustomStackTrace.cpp
deleted file mode 100644
index 9723d7df974..00000000000
--- a/paddle/legacy/utils/CustomStackTrace.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CustomStackTrace.h"
-#include <gflags/gflags.h>
-#include <iostream>
-
-DEFINE_bool(
-    layer_stack_error_only_current_thread,
-    true,
-    "Dump current thread or whole process layer stack when signal error "
-    "occurred. true means only dump current thread layer stack");
-
-namespace paddle {
-
-CustomStackTrace<std::string> gLayerStackTrace;
-
-static std::mutex gLayerStackTraceMtx;
-void installLayerStackTracer() {
-  logging::installFailureWriter([](const char* data, int sz) {
-    std::lock_guard<std::mutex> guard(gLayerStackTraceMtx);
-    if (!gLayerStackTrace.empty()) {
-      size_t curTid = -1UL;
-      std::hash<std::thread::id> hasher;
-      gLayerStackTrace.dump(
-          [&curTid, &hasher](std::thread::id tid,
-                             bool* isForwarding,
-                             const std::string& layerName) {
-            if (curTid != hasher(tid)) {
-              if (curTid != -1UL) {
-                std::cerr << std::endl;
-              }
-              curTid = hasher(tid);
-              std::cerr << "Thread [" << tid << "] ";
-              if (isForwarding) {
-                std::cerr << (*isForwarding ? "Forwarding " : "Backwarding ");
-              }
-            }
-            std::cerr << layerName << ", ";
-          },
-          FLAGS_layer_stack_error_only_current_thread);
-      std::cerr << std::endl;
-    }
-    std::cerr.write(data, sz);
-  });
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CustomStackTrace.h b/paddle/legacy/utils/CustomStackTrace.h
deleted file mode 100644
index b60077ea2d9..00000000000
--- a/paddle/legacy/utils/CustomStackTrace.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <stack>
-#include <thread>
-#include <unordered_map>
-
-#include "ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A ThreadLocal stack for tracing train/test process.
- * (More details of ThreadLocal can be find
- * in the comments of ThreadLocal class.)
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::CustomStackTrace<std::string> stack;
- * for (auto& layer : layers){
- *   stack.push(layer->getName());
- *   layer->forward();
- * }
- *
- * stack.pop("");  // mark under pop stage.
- *
- * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
- *   auto& layer = *it;
- *   layer->backward(passType);
- *   stack.pop(layer->getName());
- * }
- *
- * @endcode
- */
-template <typename T>
-class CustomStackTrace {
- public:
-  /**
-   * @brief Pop out an item from the top of the stack if item == top.
-   *        Else, just set status to popping.
-   */
-  void pop(const T& item) {
-    auto& s = this->stack();
-    if (item == s.top()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief Indicate whether we are at forward or backward stage of computation
-   */
-  void set_stage(bool isForward) { pushing() = isForward; }
-
-  /**
-   * @brief clear current thread stack.
-   */
-  void clear() {
-    auto& s = stack();
-    while (!s.empty()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief return true if all thread's stack is empty.
-   * @return true if empty
-   */
-  bool empty() const {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::stack<T>& s = *p.second;
-      if (!s.empty()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /**
-   * @brief DumpCallback Type. It will be invoked many times by dump method.
-   *
-   * The first parameter is stack thread id.
-   * The second parameter is the last action of stack is push or not.
-   * The third parameter is the item in stack.
-   */
-  typedef std::function<void(const std::thread::id& /*threadId*/,
-                             bool* /*isPushing*/,
-                             const T& /*item*/)>
-      DumpCallback;
-
-  /**
-   * Dump all thread stack, and all stack will be cleared.
-   */
-  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::thread::id tid = p.first;
-      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
-        continue;
-      }
-      std::stack<T>& s = *p.second;
-      bool* isPush = nullptr;
-      auto it = this->pushingBuffers_.find(tid);
-      if (it != this->pushingBuffers_.end()) {
-        isPush = it->second;
-      }
-
-      while (!s.empty()) {
-        callback(tid, isPush, s.top());
-        s.pop();
-      }
-    }
-  }
-
-  /**
-   * @brief Push item to current thread stack.
-   */
-  void push(const T& item) {
-    pushing() = true;
-    auto& p = this->stack();
-    p.push(item);
-  }
-
- private:
-  /**
-   * Get thread local attribute, and save them into a map (threadId => TYPE*)
-   *
-   * @tparam TYPE thread local attribute type.
-   * @param threadLocal Thread Local object.
-   * @param buffers a map from threadId to TYPE*
-   */
-  template <typename TYPE>
-  inline TYPE& getThreadLocal(
-      ThreadLocal<TYPE>& threadLocal,
-      std::unordered_map<std::thread::id, TYPE*>& buffers) {
-    TYPE* retv = threadLocal.get(false);
-    if (retv) {
-      return *retv;
-    } else {
-      std::lock_guard<std::mutex> guard(this->mtx_);
-      retv = threadLocal.get();
-      auto id = std::this_thread::get_id();
-      buffers.insert({id, retv});
-      return *retv;
-    }
-  }
-
-  /**
-   * @brief Get thread local stack reference.
-   */
-  std::stack<T>& stack() {
-    return this->getThreadLocal(this->logStack_, this->stackBuffers_);
-  }
-
-  /**
-   * @brief Get thread local pushing flag.
-   */
-  bool& pushing() {
-    return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
-  }
-
- private:
-  mutable std::mutex mtx_;
-
-  std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
-  std::unordered_map<std::thread::id, bool*> pushingBuffers_;
-  ThreadLocal<bool> isPushing_;
-  ThreadLocal<std::stack<T>> logStack_;
-};
-
-extern CustomStackTrace<std::string> gLayerStackTrace;
-
-/**
- * @brief Install a failure handler to print layer stack when error.
- */
-extern void installLayerStackTracer();
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/DynamicLoader.cpp b/paddle/legacy/utils/DynamicLoader.cpp
deleted file mode 100644
index 9ac4a56c6e3..00000000000
--- a/paddle/legacy/utils/DynamicLoader.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DynamicLoader.h"
-#include <gflags/gflags.h>
-#include "Logging.h"
-
-DEFINE_string(cudnn_dir,
-              "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
-
-DEFINE_string(cuda_dir,
-              "",
-              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
-
-DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
-
-DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
-
-static inline std::string join(const std::string& part1,
-                               const std::string& part2) {
-  // directory separator
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
-                                               void** dso_handle,
-                                               int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
-// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-// bring System Integrity Projection (SIP), if dso_handle
-// is null, search from default package path in Mac OS.
-#if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == *dso_handle) {
-    dso_path = join("/usr/local/cuda/lib/", dso_path);
-    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    if (nullptr == *dso_handle) {
-      if (dso_path == "libcudnn.dylib") {
-        LOG(FATAL)
-            << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
-            << "For instance, sudo tar -xzf "
-               "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
-            << "/usr/local \n sudo chmod a+r "
-               "/usr/local/cuda/include/cudnn.h "  // NOLINT
-            << "/usr/local/cuda/lib/libcudnn*";
-      }
-    }
-  }
-#endif
-}
-
-static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
-                                              const std::string& dso_name,
-                                              void** dso_handle) {
-  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-  *dso_handle = nullptr;
-
-  std::string dlPath = dso_name;
-  if (search_root.empty()) {
-    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-  } else {
-    // search xxx.so from custom path
-    dlPath = join(search_root, dso_name);
-    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
-    // if not found, search from default path
-    if (nullptr == *dso_handle) {
-      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
-                   << dlerror() << ")";
-      dlPath = dso_name;
-      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-    }
-  }
-
-  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << " (" << dlerror() << ") \n"
-                                << "Please specify its path correctly using "
-                                   "following ways: \n"
-
-                                << "Method. set environment variable "
-                                   "LD_LIBRARY_PATH on Linux or "
-                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
-                                << "For instance, issue command: export "
-                                   "LD_LIBRARY_PATH=... \n"
-
-                                << "Note: After Mac OS 10.11, using the "
-                                   "DYLD_LIBRARY_PATH is impossible "
-                                << "unless System Integrity Protection (SIP) "
-                                   "is disabled.";
-}
-
-void GetCublasDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
-#endif
-}
-
-void GetCudnnDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
-#endif
-}
-
-void GetCurandDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
-#endif
-}
-
-void GetWarpCTCDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
-#endif
-}
-
-void GetLapackDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
-#endif
-}
-
-void GetTensorRtDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(
-      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
-#endif
-}
diff --git a/paddle/legacy/utils/DynamicLoader.h b/paddle/legacy/utils/DynamicLoader.h
deleted file mode 100644
index 02f519de4b3..00000000000
--- a/paddle/legacy/utils/DynamicLoader.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <dlfcn.h>
-#include <memory>
-#include <mutex>
-#include <string>
-
-/**
- * @brief    load the DSO of CUBLAS
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCublasDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CUDNN
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudnnDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CURAND
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCurandDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of warp-ctc
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetWarpCTCDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of lapack
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetLapackDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of tensorrt
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetTensorRtDsoHandle(void** dso_handle);
diff --git a/paddle/legacy/utils/Error.h b/paddle/legacy/utils/Error.h
deleted file mode 100644
index 1fc8482e3a1..00000000000
--- a/paddle/legacy/utils/Error.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-/**
- * __must_check macro. It make the function's return value must be used,
- * otherwise it will raise a compile warning. And also Paddle treat all compile
- * warnings as errors.
- */
-#ifdef __GNUC__
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#else
-#define __must_check
-#endif
-#else
-#define __must_check
-#endif
-
-namespace paddle {
-
-/**
- * Error is Paddle error code. It only contain a std::string as error message.
- *
- *
- * There are two styles to return error in Paddle.
- *
- * 1. Return Error
- *    When method return a status, the return must use `__must_check` attribute.
- *    Example as below.
- * @code{cpp}
- * Error __must_check foo();
- *
- * Error __must_check bar() {
- *   // do something.
- *   Error err = foo();  // invoke other method return status.
- *   if (err) return err;
- *   // do something else.
- *   return Error();
- * }
- * @endcode{cpp}
- *
- * 2. Return by parameter.
- *    It is another way to return an error, by using a pointer parameter.
- *    Example as below.
- *
- * @code{cpp}
- * Error bar();
- *
- * int foo(Error* error) {
- *   // Do something.
- *   Error err = bar();
- *   if (err) {
- *     *error = s;
- *     return 0;
- *   }
- *   // Do something else.
- *   if (someInternalErrorHappend) {
- *     *error = Error("Some dimension is too large, %d", dimension);
- *     return 0;
- *   }
- *   // End of method.
- *   return someValue;
- * }
- *
- * Error foobar() {
- *   Error err;
- *   // do something.
- *   foo(&err);
- *   if (err) return err;
- * }
- * @endcode{cpp}
- *
- *
- * Currently there is a helper method 'check' in status, because Paddle always
- * use log(FATAL) or CHECK to make program exit before. When we clean all
- * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
- */
-class Error {
- public:
-  /**
-   * Construct a no-error value.
-   */
-  Error() {}
-
-  /**
-   * @brief Create an Error use printf syntax.
-   */
-  explicit Error(const char* fmt, ...) {
-    va_list ap;
-    va_start(ap, fmt);
-    constexpr size_t kBufferSize = 1024;
-    char buffer[kBufferSize];
-    vsnprintf(buffer, kBufferSize, fmt, ap);
-    this->msg_.reset(new std::string(buffer));
-    va_end(ap);
-  }
-
-  /**
-   * @brief msg will return the error message. If no error, return nullptr.
-   */
-  const char* msg() const {
-    if (msg_) {
-      return msg_->c_str();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief check this status by glog.
-   * @note It is a temp method used during cleaning Paddle code. It will be
-   *       removed later.
-   */
-  void check() const { CHECK(this->isOK()) << msg(); }
-
-  /**
-   * @brief isOK return True if there is no error.
-   * @return True if no error.
-   */
-  bool isOK() const { return msg_ == nullptr; }
-
- private:
-  std::shared_ptr<std::string> msg_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Excepts.h b/paddle/legacy/utils/Excepts.h
deleted file mode 100644
index 5c2c504f53a..00000000000
--- a/paddle/legacy/utils/Excepts.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef EXCEPTS_H_
-#define EXCEPTS_H_
-
-#include <fenv.h>
-
-#if defined(__APPLE__) || defined(__OSX__)
-
-int fegetexcept(void);
-int feenableexcept(unsigned int excepts);
-int fedisableexcept(unsigned int excepts);
-
-#endif
-
-#endif  // EXCEPTS_H_
diff --git a/paddle/legacy/utils/Flags.cpp b/paddle/legacy/utils/Flags.cpp
deleted file mode 100644
index ea47cf23eb6..00000000000
--- a/paddle/legacy/utils/Flags.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Flags.h"
-
-#ifndef PADDLE_WITH_CUDA
-DEFINE_bool(use_gpu, false, "Only support CPU training");
-#else
-DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-// TODO(TJ): change to true when MKLDNN layers support multi-inputs
-DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
-#else
-DEFINE_bool(use_mkldnn, false, "Only support CPU training");
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-// TODO(TJ): change to true when fully confirmed
-DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization");
-#else
-DEFINE_bool(use_mkl_packed, false, "Not to use MKL Packed Optimization");
-#endif
-
-DEFINE_bool(parallel_nn,
-            false,
-            "Whether to use multi-threads to calculate one neural network."
-            "If it was set false, use gpu_id specify which gpu core to use"
-            "(the device property in the trainer config file will be ingored)."
-            "If it was set true, the gpu core is specified by the trainer"
-            "  config file(gpu_id will be ignored).");
-DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
-DEFINE_int32(gpu_id, 0, "Which gpu core to use");
-DEFINE_int32(port, 20134, "Listening port for pserver");
-DEFINE_int32(ports_num,
-             1,
-             "Number of ports for sending dense parameter,"
-             " following ports on parameter server will be visited"
-             " for sending dense parameter: [port, port+ports_num-1]");
-DEFINE_int32(ports_num_for_sparse,
-             0,
-             "Number of ports for sending sparse parameter,"
-             " following ports on parameter server will be visited"
-             " for sending sparse parameter:"
-             " [port+ports_num, port+ports_num+ports_num_for_sparse-1]");
-DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
-DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
-DEFINE_int32(trainer_id,
-             0,
-             "For distributed training, each trainer must be given an unique id"
-             " ranging from 0 to num_trainers-1. Trainer 0 is the master"
-             " trainer");
-DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
-DEFINE_string(comment, "", "A string for commenting this training task");
-DEFINE_string(load_missing_parameter_strategy,
-              "fail",
-              "which operation to take on load model fails. support "
-              "fail/rand/zero only.");
-DEFINE_int32(log_period, 100, "Log progress every so many batches");
-DEFINE_int32(log_period_server,
-             500,
-             "Log progress every so many batches at pserver end");
-DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
-DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector");
-DEFINE_bool(loadsave_parameters_in_pserver,
-            false,
-            "load and save parameters in pserver. "
-            "only work while parameter set sparse_remote_update.");
-DEFINE_int32(beam_size,
-             1,
-             "Beam size used in generating most probable output sequences.");
-
-DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
-DEFINE_string(predict_file, "", "File name for saving predict result");
-DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
-DEFINE_string(init_model_path,
-              "",
-              "Path of the initial model parameters."
-              "If it was set, start_pass will be ignored.");
diff --git a/paddle/legacy/utils/Flags.h b/paddle/legacy/utils/Flags.h
deleted file mode 100644
index b64295bca09..00000000000
--- a/paddle/legacy/utils/Flags.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-
-DECLARE_bool(parallel_nn);
-DECLARE_int32(async_count);
-DECLARE_int32(port);
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_int32(trainer_count);
-DECLARE_int32(ports_num);
-DECLARE_int32(ports_num_for_sparse);
-DECLARE_string(nics);
-DECLARE_string(rdma_tcp);
-DECLARE_int32(trainer_id);
-DECLARE_int32(num_gradient_servers);
-DECLARE_string(comment);
-DECLARE_string(load_missing_parameter_strategy);
-DECLARE_int32(log_period);
-DECLARE_int32(log_period_server);
-DECLARE_double(checkgrad_eps);
-DECLARE_int32(enable_parallel_vector);
-DECLARE_bool(loadsave_parameters_in_pserver);
-DECLARE_int32(beam_size);
-DECLARE_bool(show_layer_stat);
-DECLARE_string(predict_file);
-DECLARE_bool(prev_batch_state);
-DECLARE_string(init_model_path);
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkl_packed);
diff --git a/paddle/legacy/utils/GlobalConstants.cpp b/paddle/legacy/utils/GlobalConstants.cpp
deleted file mode 100644
index 9e8dade0b22..00000000000
--- a/paddle/legacy/utils/GlobalConstants.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GlobalConstants.h"
-
-namespace paddle {
-
-const std::string TrainAlgorithm::SGD = "sgd";
-const std::string TrainAlgorithm::AsyncSGD = "async_sgd";
-const std::string TrainAlgorithm::OWLQN = "owlqn";
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/GlobalConstants.h b/paddle/legacy/utils/GlobalConstants.h
deleted file mode 100644
index 3f45e822684..00000000000
--- a/paddle/legacy/utils/GlobalConstants.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-namespace paddle {
-
-namespace enumeration_wrapper {
-enum PassType {
-  PASS_TRAIN,   // Train pass
-  PASS_TEST,    // Test pass
-  PASS_GC,      // Gradient Check pass
-  PASS_METRIC,  // pass for generate template output with no drop rate.
-};
-
-enum ParameterType {
-  PARAMETER_VALUE = 0,
-  PARAMETER_GRADIENT,
-  PARAMETER_MOMENTUM,
-
-  // Used by ParameterAverager
-  PARAMETER_SUM1,
-  PARAMETER_SUM2,
-  PARAMETER_SUM3,
-
-  //   also used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_LEARNING_RATE,
-
-  // Used by Sparse SGD update
-  PARAMETER_UPDATE_TIME,
-
-  // Used by async_sgd
-  // Change of the parameter since last remote update
-  PARAMETER_DELTA,
-
-  // Used by BatchRemoteParameterUpdater
-  PARAMETER_GRADIENT_SUM,
-
-  // Used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_GRADIENT_SQURESUM,
-  PARAMETER_GRADIENT_SQURESUM1,
-
-  // Used by SparseConnected layer
-  PARAMETER_ROWS,
-  PARAMETER_COLS,
-
-  // Used by Adam Optimizer.
-  PARAMETER_SECOND_MOMENTUM,
-
-  // Used By AdaMax Optimizer.
-  PARAMETER_WEIGHTED_INFINITY_NORM,
-
-  // Used by remote parameter average
-  PARAMETER_APPLY,
-
-  // Used by sparse momentum
-  PARAMETER_MOMENTUM_UT,
-  PARAMETER_MOMENTUM_VT,
-
-  NUM_PARAMETER_TYPES,
-};
-
-}  // namespace enumeration_wrapper
-
-//! explicit import enum into paddle namespace.
-using namespace enumeration_wrapper;  // NOLINT
-
-class TrainAlgorithm {
- public:
-  static const std::string SGD;
-  static const std::string AsyncSGD;
-  static const std::string OWLQN;
-
-  static inline bool isValid(const std::string& algo) {
-    return algo == SGD || algo == AsyncSGD || algo == OWLQN;
-  }
-};
-
-#ifdef __AVX__
-const int ALIGN_HINT = 32;
-#else
-const int ALIGN_HINT = 16;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Locks.h b/paddle/legacy/utils/Locks.h
deleted file mode 100644
index 65f983685f5..00000000000
--- a/paddle/legacy/utils/Locks.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <pthread.h>
-#include <sys/time.h>
-#include <condition_variable>
-#include <mutex>
-
-#include "Common.h"
-
-namespace paddle {
-
-/**
- * A simple read-write lock.
- * The RWlock allows a number of readers or at most one writer
- * at any point in time.
- * The RWlock disable copy.
- *
- * Lock:
- *
- * Use lock() to lock on write mode, no other thread can get it
- * until unlock.
- *
- * Use lock_shared() to lock on read mode, other thread can get
- * it by using the same method lock_shared().
- *
- * Unlock:
- *
- * Use unlock() to unlock the lock.
- */
-class RWLock {
- public:
-  RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
-  ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
-  RWLock(const RWLock&) = delete;
-  RWLock& operator=(const RWLock&) = delete;
-
-  /**
-   * @brief lock on write mode.
-   * @note the method will block the thread, if failed to get the lock.
-   */
-  // std::mutex interface
-  void lock() { pthread_rwlock_wrlock(&rwlock_); }
-  /**
-   * @brief lock on read mode.
-   * @note if another thread is writing, it can't get the lock,
-   * and will block the thread.
-   */
-  void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
-  void unlock() { pthread_rwlock_unlock(&rwlock_); }
-
- protected:
-  pthread_rwlock_t rwlock_;
-};
-
-/**
- * The ReadLockGuard is a read mode RWLock
- * using RAII management mechanism.
- */
-class ReadLockGuard {
- public:
-  /**
-   * @brief Construct Function. Lock on rwlock in read mode.
-   */
-  explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) {
-    rwlock_->lock_shared();
-  }
-
-  /**
-   * @brief Destruct Function.
-   * @note This method just unlock the read mode rwlock,
-   * won't destroy the lock.
-   */
-  ~ReadLockGuard() { rwlock_->unlock(); }
-
- protected:
-  RWLock* rwlock_;
-};
-
-/**
- * A simple wrapper for spin lock.
- * The lock() method of SpinLock is busy-waiting
- * which means it will keep trying to lock until lock on successfully.
- * The SpinLock disable copy.
- */
-class SpinLockPrivate;
-class SpinLock {
- public:
-  DISABLE_COPY(SpinLock);
-  SpinLock();
-  ~SpinLock();
-
-  // std::mutext interface
-  void lock();
-  void unlock();
-
- private:
-  SpinLockPrivate* m;
-};
-
-/**
- * A simple wapper of semaphore which can only be shared in the same process.
- */
-class SemaphorePrivate;
-class Semaphore {
- public:
-  //! Disable copy & assign
-  Semaphore(const Semaphore& other) = delete;
-  Semaphore& operator=(const Semaphore&& other) = delete;
-
-  //! Enable move.
-  Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
-
- public:
-  /**
-   * @brief Construct Function.
-   * @param[in] initValue the initial value of the
-   * semaphore, default 0.
-   */
-  explicit Semaphore(int initValue = 0);
-
-  ~Semaphore();
-
-  /**
-   * @brief The same as wait(), except if the decrement can not
-   * be performed until ts return false install of blocking.
-   * @param[in] ts an absolute timeout in seconds and nanoseconds
-   * since the Epoch 1970-01-01 00:00:00 +0000(UTC).
-   * @return ture if the decrement proceeds before ts,
-   * else return false.
-   */
-  bool timeWait(struct timespec* ts);
-
-  /**
-   * @brief decrement the semaphore. If the semaphore's value is 0, then call
-   * blocks.
-   */
-  void wait();
-
-  /**
-   * @brief increment the semaphore. If the semaphore's value
-   * greater than 0, wake up a thread blocked in wait().
-   */
-  void post();
-
- private:
-  SemaphorePrivate* m;
-};
-
-/**
- * A simple wrapper of thread barrier.
- * The ThreadBarrier disable copy.
- */
-class ThreadBarrierPrivate;
-class ThreadBarrier {
- public:
-  DISABLE_COPY(ThreadBarrier);
-
-  /**
-   * @brief Construct Function. Initialize the barrier should
-   * wait for count threads in wait().
-   */
-  explicit ThreadBarrier(int count);
-  ~ThreadBarrier();
-
-  /**
-   * @brief .
-   * If there were count - 1 threads waiting before,
-   * then wake up all the count - 1 threads and continue run together.
-   * Else block the thread until waked by other thread .
-   */
-  void wait();
-
- private:
-  ThreadBarrierPrivate* m;
-};
-
-/**
- * A wrapper for condition variable with mutex.
- */
-class LockedCondition : public std::condition_variable {
- public:
-  /**
-   * @brief execute op and notify one thread which was blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_one(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_one();
-  }
-
-  /**
-   * @brief execute op and notify all the threads which were blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_all(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_all();
-  }
-
-  /**
-   * @brief wait until pred return ture.
-   * @tparam Predicate c++ concepts, describes a function object
-   * that takes a single iterator argument
-   * that is dereferenced and used to
-   * return a value testable as a bool.
-   * @note pred shall not apply any non-constant function
-   * through the dereferenced iterator.
-   */
-  template <class Predicate>
-  void wait(Predicate pred) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    std::condition_variable::wait(lock, pred);
-  }
-
-  /**
-   * @brief get mutex.
-   */
-  std::mutex* mutex() { return &mutex_; }
-
- protected:
-  std::mutex mutex_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Logging.cpp b/paddle/legacy/utils/Logging.cpp
deleted file mode 100644
index ea96bad240a..00000000000
--- a/paddle/legacy/utils/Logging.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include "Logging.h"
-#include <cstdlib>
-
-namespace paddle {
-
-void initializeLogging(int argc, char** argv) {
-  (void)(argc);
-  if (!getenv("GLOG_logtostderr")) {
-    google::LogToStderr();
-  }
-  google::InstallFailureSignalHandler();
-  google::InitGoogleLogging(argv[0]);
-}
-
-namespace logging {
-
-void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
-
-void installFailureFunction(void (*callback)()) {
-  google::InstallFailureFunction(callback);
-}
-
-void installFailureWriter(void (*callback)(const char*, int)) {
-  google::InstallFailureWriter(callback);
-}
-
-}  // namespace logging
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Logging.h b/paddle/legacy/utils/Logging.h
deleted file mode 100644
index d9e551f0891..00000000000
--- a/paddle/legacy/utils/Logging.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.h
- * Used in embedded system where there is no glogs.
- */
-
-#pragma once
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include <glog/logging.h>
-namespace paddle {
-
-void initializeLogging(int argc, char** argv);
-
-namespace logging {
-
-void setMinLogLevel(int level);
-
-void installFailureFunction(void (*callback)());
-
-void installFailureWriter(void (*callback)(const char*, int));
-
-}  // namespace logging
-}  // namespace paddle
-
-#ifndef NDEBUG
-#define DEBUG_LEVEL 5
-#define DBG VLOG(DEBUG_LEVEL)
-#else
-#define DBG DLOG(INFO)
-#endif
diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp
deleted file mode 100644
index 21ed049c4d2..00000000000
--- a/paddle/legacy/utils/PythonUtil.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PythonUtil.h"
-#include <signal.h>
-#include <sstream>
-
-namespace paddle {
-
-#ifdef PADDLE_NO_PYTHON
-
-DEFINE_string(python_path, "", "python path");
-DEFINE_string(python_bin, "python2.7", "python bin");
-
-constexpr int kExecuteCMDBufLength = 204800;
-
-int executeCMD(const char* cmd, char* result) {
-  char bufPs[kExecuteCMDBufLength];
-  char ps[kExecuteCMDBufLength] = {0};
-  FILE* ptr;
-  strncpy(ps, cmd, kExecuteCMDBufLength);
-  if ((ptr = popen(ps, "r")) != NULL) {
-    size_t count = fread(bufPs, 1, kExecuteCMDBufLength, ptr);
-    memcpy(result,
-           bufPs,
-           count - 1);  // why count-1: remove the '\n' at the end
-    result[count] = 0;
-    pclose(ptr);
-    ptr = NULL;
-    return count - 1;
-  } else {
-    LOG(FATAL) << "popen failed";
-    return -1;
-  }
-}
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args) {
-  std::string pythonLibPath = "";
-  std::string pythonBinPath = "";
-  if (!FLAGS_python_path.empty()) {
-    pythonLibPath = FLAGS_python_path + "/lib:";
-    pythonBinPath = FLAGS_python_path + "/bin/";
-  }
-  std::string s = "LD_LIBRARY_PATH=" + pythonLibPath + "$LD_LIBRARY_PATH " +
-                  pythonBinPath + std::string(FLAGS_python_bin) +
-                  " -c 'import " + moduleName + "\n" + "print " + moduleName +
-                  "." + funcName + "(";
-  for (auto& arg : args) {
-    s = s + "\"" + arg + "\", ";
-  }
-  s += ")'";
-  char result[kExecuteCMDBufLength] = {0};
-  LOG(INFO) << " cmd string: " << s;
-  int length = executeCMD(s.c_str(), result);
-  CHECK_NE(-1, length);
-  return std::string(result, length);
-}
-
-#else
-
-static std::recursive_mutex g_pyMutex;
-
-PyGuard::PyGuard() : guard_(g_pyMutex) {}
-
-static void printPyErrorStack(std::ostream& os,
-                              bool withEndl = false,
-                              bool withPyPath = true) {
-  PyObject *ptype, *pvalue, *ptraceback;
-  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
-  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  PyErr_Clear();
-  if (withPyPath) {
-    os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path")));
-    if (withEndl) {
-      os << std::endl;
-    }
-  }
-  PyTracebackObject* obj = (PyTracebackObject*)ptraceback;
-
-  os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) << " : "
-     << (pvalue == NULL ? "" : PyString_AsString(PyObject_Str(pvalue)));
-  if (withEndl) {
-    os << std::endl;
-  }
-  os << "Python Callstack: ";
-  if (withEndl) {
-    os << std::endl;
-  }
-  while (obj != NULL) {
-    int line = obj->tb_lineno;
-    const char* filename =
-        PyString_AsString(obj->tb_frame->f_code->co_filename);
-    os << "            " << filename << " : " << line;
-    if (withEndl) {
-      os << std::endl;
-    }
-    obj = obj->tb_next;
-  }
-
-  Py_XDECREF(ptype);
-  Py_XDECREF(pvalue);
-  Py_XDECREF(ptraceback);
-}
-PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
-                                   const std::string& funcName,
-                                   const std::vector<std::string>& args) {
-  PyGuard guard;
-  PyObjectPtr pyModule = py::import(moduleName);
-  PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str()));
-  CHECK_PY(pyFunc) << "GetAttrString failed.";
-  PyObjectPtr pyArgs(PyTuple_New(args.size()));
-  for (size_t i = 0; i < args.size(); ++i) {
-    PyObjectPtr pyArg(PyString_FromString(args[i].c_str()));
-    CHECK_PY(pyArg) << "Import pyArg failed.";
-    PyTuple_SetItem(pyArgs.get(), i, pyArg.release());  //  Maybe a problem
-  }
-  PyObjectPtr ret(PyObject_CallObject(pyFunc.get(), pyArgs.get()));
-  CHECK_PY(ret) << "Call Object failed.";
-  return ret;
-}
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args) {
-  PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args);
-#if PY_MAJOR_VERSION >= 3
-  Py_ssize_t str_size = 0u;
-  const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size);
-  return std::string(str, (size_t)str_size);
-#else
-  return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-#endif  // PY_MAJOR_VERSION >= 3
-}
-
-PyObjectPtr createPythonClass(
-    const std::string& moduleName,
-    const std::string& className,
-    const std::vector<std::string>& args,
-    const std::map<std::string, std::string>& kwargs) {
-  PyGuard guard;
-  PyObjectPtr pyModule = py::import(moduleName);
-  LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str();
-  CHECK_PY(pyModule) << "Import module " << moduleName << " failed.";
-  PyObjectPtr pyDict(PyModule_GetDict(pyModule.get()));
-  CHECK_PY(pyDict) << "Get Dict failed.";
-  PyObjectPtr pyClass(PyDict_GetItemString(pyDict.get(), className.c_str()));
-  LOG(INFO) << "createPythonClass className.c_str():" << className.c_str();
-  CHECK_PY(pyClass) << "Import class " << className << " failed.";
-  PyObjectPtr argsObjectList(PyTuple_New(args.size()));
-  for (size_t i = 0; i < args.size(); ++i) {
-    PyObjectPtr pyArg(Py_BuildValue("s#", args[i].c_str(), args[i].length()));
-    PyTuple_SetItem(argsObjectList.get(), i, pyArg.release());
-  }
-
-  PyObjectPtr kwargsObjectList(PyDict_New());
-  for (auto& x : kwargs) {
-    PyObjectPtr pyArg(Py_BuildValue("s#", x.second.c_str(), x.second.length()));
-    PyDict_SetItemString(
-        kwargsObjectList.get(), x.first.c_str(), pyArg.release());
-  }
-
-  PyObjectPtr pyInstance(PyInstance_New(
-      pyClass.get(), argsObjectList.release(), kwargsObjectList.release()));
-  CHECK_PY(pyInstance) << "Create class " << className << " failed.";
-  return pyInstance;
-}
-
-namespace py {
-char* repr(PyObject* obj) { return PyString_AsString(PyObject_Repr(obj)); }
-
-std::string getPyCallStack() {
-  std::ostringstream os;
-  printPyErrorStack(os, true);
-  return os.str();
-}
-
-PyObjectPtr import(const std::string& moduleName) {
-  auto module = PyImport_ImportModule(moduleName.c_str());
-  CHECK_PY(module) << "Import " << moduleName << "Error";
-  return PyObjectPtr(module);
-}
-
-}  // namespace py
-
-#endif
-extern "C" {
-extern const char enable_virtualenv_py[];
-}
-void initPython(int argc, char** argv) {
-#ifndef PADDLE_NO_PYTHON
-  Py_SetProgramName(argv[0]);
-  Py_Initialize();
-  PySys_SetArgv(argc, argv);
-  // python blocks SIGINT. Need to enable it.
-  signal(SIGINT, SIG_DFL);
-
-  // Manually activate virtualenv when user is using virtualenv
-  PyRun_SimpleString(enable_virtualenv_py);
-#endif
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h
deleted file mode 100644
index d5b2dbddde2..00000000000
--- a/paddle/legacy/utils/PythonUtil.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-// clang-format off
-#include "paddle/legacy/utils/Util.h"
-
-#ifndef PADDLE_NO_PYTHON
-// must include the following two blocks, otherwise,
-// gcc compiler may produce warning
-#ifdef __APPLE__
-#define _POSIX_SOURCE
-#define _POSIX_C_SOURCE 200809L
-#define _XOPEN_SOURCE 700
-#endif
-
-#ifdef _POSIX_C_SOURCE
-#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-#ifdef _XOPEN_SOURCE
-#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-#include <Python.h>
-#include <frameobject.h>
-#endif
-
-#include <stdarg.h>
-#include <map>
-#include <mutex>
-// clang-format on
-
-namespace paddle {
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args);
-
-#ifndef PADDLE_NO_PYTHON
-
-/**
- * Global lock guard of python C-api invokes.
- * NOTE: the lock of this guard is reentrant or recursive.
- */
-class PyGuard {
- public:
-  PyGuard();
-  PyGuard(const PyGuard& other) = delete;
-  PyGuard& operator=(const PyGuard& other) = delete;
-
- private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-struct PyObjectDeleter {
-  void operator()(PyObject* obj) {
-    if (obj) {
-      Py_DECREF(obj);
-    }
-  }
-};
-
-typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
-
-PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
-                                   const std::string& funcName,
-                                   const std::vector<std::string>& args);
-
-PyObjectPtr createPythonClass(const std::string& moduleName,
-                              const std::string& className,
-                              const std::vector<std::string>& args,
-                              const std::map<std::string, std::string>& kwargs);
-
-#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
-
-namespace py {
-PyObjectPtr import(const std::string& moduleName);
-
-#if PY_MAJOR_VERSION >= 3
-/**
- * Cast a PyLong to int type T.
- * @tparam T return type.
- * @param [in] obj PyLong object.
- * @param [out] ok status for casting. False if error occured. nullptr if user
- *                 don't care is ok or not.
- * @return The value of python object, or 0 if not ok.
- */
-template <typename T>
-T castInt(PyObject* obj, bool* ok = nullptr) {
-  // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object
-  // were unified to long since python3
-  if (PyLong_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyLong_AsUnsignedLong(obj);
-  } else {
-    if (ok) *ok = false;
-    return (T)0;
-  }
-}
-
-// Convert PyAPI from 2.x to 3.x
-#define PyString_FromString PyUnicode_FromString
-#define PyString_AsString PyUnicode_AsUTF8
-
-#else
-/**
- * Cast a PyLong or PyInt to int type T.
- * @tparam T return type.
- * @param [in] obj PyLong or PyInt object.
- * @param [out] ok status for casting. False if error occured. nullptr if user
- *                 don't care is ok or not.
- * @return The value of python object, or 0 if not ok.
- */
-template <typename T>
-T castInt(PyObject* obj, bool* ok = nullptr) {
-  if (PyLong_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyLong_AsUnsignedLong(obj);
-  } else if (PyInt_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyInt_AsLong(obj);
-  } else {
-    if (ok) *ok = false;
-    return (T)0;
-  }
-}
-#endif  // PY_MAJOR_VERSION >= 3
-
-/**
- * Invoke repr of python object.
- *
- * Just like toString method in java.
- */
-char* repr(PyObject* obj);
-
-/**
- * Invoke repr of python object.
- */
-inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); }
-
-/**
- * Get Python Error Stack String.
- */
-std::string getPyCallStack();
-
-/**
- * Object Helper for PyObjectPtr.
- *
- * Implements getAttr method for object.
- */
-class ObjectHelper {
- public:
-  explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
-
-  /**
-   * get attribute
-   */
-  inline PyObject* getAttr(const std::string& field) const {
-    auto obj = PyObject_GetAttrString(obj_.get(), field.c_str());
-    CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get();
-    return obj;
-  }
-
-  /**
-   * Get Int attribute
-   * @param [in] field  attribute name.
-   * @param [out] ok true if this attribute is int.
-   * @tparam T int type.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttr(const std::string& field, bool* ok = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    return castInt<T>(tmp.get(), ok);
-  }
-
-  /**
-   * Get int attribute. Log(Fatal) when not ok
-   * @param field attribute name.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttrWithError(const std::string& field) const {
-    bool ok;
-    T tmp = getIntAttr<T>(field, &ok);
-    CHECK(ok) << "Cannot get integer attribute on object " << obj_.get();
-    return tmp;
-  }
-
-  /**
-   * Get bool attribute.
-   * @param field
-   * @param [out] isBoolType return true if attribute is bool type. If the
-   *                         attribute is not bool type, then an implicit
-   *                         conversion will happens, and will return the
-   *                         conversion result.
-   *
-   *                         Such as, if the attribute is 1, then the return
-   *                         value of function will be true, but the isBoolType
-   *                         will return false.
-   * @return
-   */
-  bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    if (isBoolType) {
-      *isBoolType = PyBool_Check(tmp.get());
-    }
-    return PyObject_IsTrue(tmp.get());
-  }
-
- private:
-  const PyObjectPtr& obj_;
-};
-
-/**
- * Python Sequence Helper
- *
- * The python sequence means list or tuple.
- */
-class SequenceHelper {
- public:
-  explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  explicit SequenceHelper(PyObject* seq) : seq_(seq) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  inline size_t size() const { return (size_t)PySequence_Size(seq_); }
-
-  inline PyObject* operator[](size_t i) const {
-    return PySequence_Fast_GET_ITEM(seq_, i);
-  }
-
-  inline double getDouble(size_t i) const {
-    auto* ptr = (*this)[i];
-    return PyFloat_AsDouble(ptr);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   * @param i index
-   * @param obj setted item.
-   * @param steal if steal = true, sequence will move object in iteself,
-   *              just like std::move. Otherwise, it will increase reference
-   *              count. Default is false.
-   */
-  inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) {
-    this->set(i, obj.get(), steal);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   */
-  inline void set(size_t i, PyObject* obj, bool steal = false) {
-    if (!steal) {
-      Py_XINCREF(obj);
-    }
-    if (PyTuple_Check(seq_)) {
-      CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    } else {
-      CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    }
-  }
-
- private:
-  PyObject* seq_;
-};
-
-class DictHelper {
- public:
-  explicit DictHelper(PyObject* d) : dict_(d) {}
-
-  explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
-
-  void set(const std::string& key, PyObject* item) {
-    PyDict_SetItemString(dict_, key.c_str(), item);
-  }
-
-  void setBool(const std::string& key, bool b) {
-    this->set(key, PyBool_FromLong(b));
-  }
-
-  void setStringList(const std::string& key,
-                     const std::vector<std::string>& items) {
-    auto* list = PyList_New(items.size());
-    for (size_t i = 0; i < items.size(); ++i) {
-      PyList_SetItem(list, i, PyString_FromString(items[i].c_str()));
-    }
-    this->set(key, list);
-  }
-
- private:
-  inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
-
-  PyObject* dict_;
-};
-
-inline static bool isCallable(const PyObjectPtr& obj) {
-  return PyCallable_Check(obj.get());
-}
-
-/**
- * Wrap a callable object.
- */
-class CallableHelper {
- public:
-  explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
-    CHECK(py::isCallable(obj_));
-  }
-
-  ~CallableHelper() {}
-
-  /**
-   * reset args, and create new tuple.
-   * @param sz args size.
-   */
-  void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); }
-
-  /**
-   * Get args sequence. User can set/get by SequenceHelper.
-   */
-  SequenceHelper getArgs() { return SequenceHelper(args); }
-
-  /**
-   * Call python method, return an object.
-   */
-  PyObject* operator()() {
-    PyGuard guard;
-    return PyObject_Call(obj_.get(), args.get(), kwargs.get());
-  }
-
- private:
-  const PyObjectPtr& obj_;
-  PyObjectPtr args;
-  PyObjectPtr kwargs;
-};
-
-inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) {
-  PyGuard g;
-  PyObject* data = PyIter_Next(context.get());
-  if (data == nullptr) {
-    if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
-      PyErr_Clear();
-      *atEnd = true;
-      return nullptr;
-    } else if (PyErr_Occurred()) {
-      CHECK_PY(data) << "Calling iterator next error";
-      return nullptr;
-    } else {
-      *atEnd = false;
-      return data;  // just return none in iterator.
-    }
-  } else {
-    *atEnd = false;
-    return data;
-  }
-}
-}  // namespace py
-
-#endif
-
-/**
- * Initialize python.
- */
-void initPython(int argc, char** argv);
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Queue.h b/paddle/legacy/utils/Queue.h
deleted file mode 100644
index 189e1a14f7b..00000000000
--- a/paddle/legacy/utils/Queue.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-
-#include "Locks.h"
-
-namespace paddle {
-
-/**
- * A thread-safe queue that automatically grows but never shrinks.
- * Dequeue a empty queue will block current thread. Enqueue an element
- * will wake up another thread that blocked by dequeue method.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::Queue<int> q;
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job);
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- *
- * @endcode
- */
-template <class T>
-class Queue {
- public:
-  /**
-   * @brief Construct Function. Default capacity of Queue is zero.
-   */
-  Queue() : numElements_(0) {}
-
-  ~Queue() {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element.
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(const T& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(el);
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element. rvalue reference .
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(T&& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(std::move(el));
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ != 0; });
-    T el;
-
-    using std::swap;
-    // Becuase of the previous statement, the right swap() can be found
-    // via argument-dependent lookup (ADL).
-    swap(elements_.front(), el);
-
-    elements_.pop_front();
-    numElements_--;
-    if (numElements_ == 0) {
-      queueCV_.notify_all();
-    }
-    return el;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is not thread safe. Obviously this number
-   * can change by the time you actually look at it.
-   */
-  inline int size() const { return numElements_; }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is not thread safe.
-   */
-  inline bool empty() const { return numElements_ == 0; }
-
-  /**
-   * @brief wait util queue is empty
-   */
-  void waitEmpty() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ == 0; });
-  }
-
-  /**
-   * @brief wait queue is not empty at most for some seconds.
-   * @param seconds wait time limit.
-   * @return true if queue is not empty. false if timeout.
-   */
-  bool waitNotEmptyFor(int seconds) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
-      return numElements_ != 0;
-    });
-  }
-
- private:
-  std::deque<T> elements_;
-  int numElements_;
-  std::mutex queueLock_;
-  std::condition_variable queueCV_;
-};
-
-/*
- * A thread-safe circular queue that
- * automatically blocking calling thread if capacity reached.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::BlockingQueue<int> q(capacity);
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job); //Block until q.size() < capacity .
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- */
-template <typename T>
-class BlockingQueue {
- public:
-  /**
-   * @brief Construct Function.
-   * @param[in] capacity the max numer of elements the queue can have.
-   */
-  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] x The enqueue element, pass by reference .
-   * @note This method is thread-safe, and will wake up another thread
-   * who was blocked because of the queue is empty.
-   * @note If it's size() >= capacity before enqueue,
-   * this method will block and wait until size() < capacity.
-   */
-  void enqueue(const T& x) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notFull_.wait(lock, [&] { return queue_.size() < capacity_; });
-    queue_.push_back(x);
-    notEmpty_.notify_one();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   * @note this method will wake up another thread who was blocked because
-   * of the queue is full.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notEmpty_.wait(lock, [&] { return !queue_.empty(); });
-
-    T front(queue_.front());
-    queue_.pop_front();
-    notFull_.notify_one();
-    return front;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is thread safe.
-   * The size of the queue won't change until the method return.
-   */
-  size_t size() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.size();
-  }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is thread safe.
-   */
-  size_t empty() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.empty();
-  }
-
- private:
-  std::mutex mutex_;
-  std::condition_variable notEmpty_;
-  std::condition_variable notFull_;
-  std::deque<T> queue_;
-  size_t capacity_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Stat.cpp b/paddle/legacy/utils/Stat.cpp
deleted file mode 100644
index ff1b1bf888f..00000000000
--- a/paddle/legacy/utils/Stat.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Stat.h"
-#include <algorithm>
-#include <iomanip>
-#include "Util.h"
-
-namespace paddle {
-
-StatSet globalStat("GlobalStatInfo");
-
-void Stat::addSample(uint64_t value) {
-  StatInfo* statInfo = statInfo_.get(false);
-  if (!statInfo) {
-    statInfo = new StatInfo(this);
-    statInfo_.set(statInfo);
-    std::lock_guard<std::mutex> guard(lock_);
-    threadLocalBuf_.push_back({statInfo, getTID()});
-  }
-  if (value > statInfo->max_) {
-    statInfo->max_ = value;
-  }
-  if (value < statInfo->min_) {
-    statInfo->min_ = value;
-  }
-  statInfo->total_ += value;
-  statInfo->count_++;
-}
-
-void Stat::mergeThreadStat(StatInfo& allThreadStat) {
-  allThreadStat = destructStat_;
-  for (auto& buf : threadLocalBuf_) {
-    if (buf.first->max_ > allThreadStat.max_) {
-      allThreadStat.max_ = buf.first->max_;
-    }
-    if (buf.first->min_ < allThreadStat.min_) {
-      allThreadStat.min_ = buf.first->min_;
-    }
-    allThreadStat.total_ += buf.first->total_;
-    allThreadStat.count_ += buf.first->count_;
-  }
-}
-
-void Stat::reset() {
-  std::lock_guard<std::mutex> guard(lock_);
-  for (auto& buf : threadLocalBuf_) {
-    buf.first->reset();
-  }
-}
-
-std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
-  std::lock_guard<std::mutex> guard(const_cast<Stat&>(stat).lock_);
-  auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
-    uint64_t average = 0;
-    if (info->count_ > 0) {
-      outPut << std::setfill(' ') << std::left;
-      if (!isFirst) {
-        outPut << std::setw(42) << " ";
-      }
-      average = info->total_ / info->count_;
-      outPut << "Stat=" << std::setw(30) << stat.getName();
-      if (tid) {
-        outPut << " TID=" << std::setw(6) << tid;
-      }
-      outPut << " total=" << std::setw(10) << info->total_ * 0.001
-             << " avg=" << std::setw(10) << average * 0.001
-             << " max=" << std::setw(10) << info->max_ * 0.001
-             << " min=" << std::setw(10) << info->min_ * 0.001
-             << " count=" << std::setw(10) << info->count_ << std::endl;
-    }
-  };
-  if (!stat.getThreadInfo()) {
-    StatInfo infoVarTmp;
-    const_cast<Stat&>(stat).mergeThreadStat(infoVarTmp);
-    showStat(&infoVarTmp, 0);
-  } else {
-    bool isFirst = true;
-    for (auto& buf : stat.threadLocalBuf_) {
-      showStat(buf.first, buf.second, isFirst);
-      if (isFirst) isFirst = false;
-    }
-    showStat(&stat.destructStat_, 0);
-  }
-
-  return outPut;
-}
-
-void StatSet::printSegTimerStatus() {
-  ReadLockGuard guard(lock_);
-  LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-            << "======= StatSet: [" << name_ << "] status ======" << std::endl;
-  for (auto& stat : statSet_) {
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << *(stat.second);
-  }
-}
-
-void StatSet::printAllStatus() {
-#ifndef PADDLE_DISABLE_TIMER
-  printSegTimerStatus();
-#endif
-  LOG(INFO) << std::setiosflags(std::ios::left)
-            << "--------------------------------------------------"
-            << std::endl;
-}
-
-void StatSet::reset(bool clearRawData) {
-  ReadLockGuard guard(lock_);
-  for (auto& stat : statSet_) {
-    stat.second->reset();
-  }
-}
-
-void StatSet::setThreadInfo(const std::string& name, bool flag) {
-  ReadLockGuard guard(lock_);
-  auto iter = statSet_.find(name);
-  CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
-  iter->second->setThreadInfo(flag);
-}
-
-StatInfo::~StatInfo() {
-  if (stat_) {
-    std::lock_guard<std::mutex> guard(stat_->lock_);
-    if (stat_->destructStat_.max_ < this->max_) {
-      stat_->destructStat_.max_ = this->max_;
-    }
-    if (stat_->destructStat_.min_ > this->min_) {
-      stat_->destructStat_.min_ = this->min_;
-    }
-    stat_->destructStat_.total_ += this->total_;
-    stat_->destructStat_.count_ += this->count_;
-    stat_->threadLocalBuf_.remove({this, getTID()});
-  }
-}
-
-static unsigned g_profileCount = 0;
-static std::recursive_mutex g_profileMutex;
-
-GpuProfiler::GpuProfiler(std::string statName, std::string info)
-    : guard_(g_profileMutex) {
-  if (++g_profileCount == 1) {
-    LOG(INFO) << "Enable GPU Profiler Stat: [" << statName << "] " << info;
-    hl_profiler_start();
-  }
-}
-
-GpuProfiler::~GpuProfiler() {
-  if (--g_profileCount == 0) {
-    hl_profiler_end();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Stat.h b/paddle/legacy/utils/Stat.h
deleted file mode 100644
index 100e9eba909..00000000000
--- a/paddle/legacy/utils/Stat.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <sys/time.h>
-#include <iostream>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-
-#include "Locks.h"
-#include "Logging.h"
-#include "ThreadLocal.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-class Stat;
-
-class StatInfo {
- public:
-  explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
-    total_ = 0;
-    max_ = 0;
-    count_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  void reset() {
-    total_ = 0;
-    count_ = 0;
-    max_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  ~StatInfo();
-
-  Stat* stat_;
-  uint64_t total_;
-  uint64_t max_;
-  uint64_t count_;
-  uint64_t min_;
-};
-
-class Stat;
-typedef std::shared_ptr<Stat> StatPtr;
-
-class StatSet {
- public:
-  explicit StatSet(const std::string& name) : name_(name) {}
-  ~StatSet() {}
-
-  // print to LOG(INFO)
-  void printSegTimerStatus();
-  void printAllStatus();
-
-  StatPtr getStat(const std::string& name) {
-    {
-      ReadLockGuard guard(lock_);
-      auto it = statSet_.find(name);
-      if (it != statSet_.end()) {
-        return it->second;
-      }
-    }
-    StatPtr stat = std::make_shared<Stat>(name);
-    std::lock_guard<RWLock> guard(lock_);
-    auto ret = statSet_.insert(std::make_pair(name, stat));
-    return ret.first->second;
-  }
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(const std::string& name, bool flag);
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(bool flag) {
-    for (auto& iter : statSet_) {
-      setThreadInfo(iter.first, flag);
-    }
-  }
-
-  // reset the counters for all stats
-  // clearRawData means also clearing raw tuning data, because at pserver end,
-  // barrier rawData(timeVector_) is stateful, clearing it will cause rubbish
-  // data, while rawData should be cleared at the new pass (so complicated
-  // pserver code logic, -_- ).
-  void reset(bool clearRawData = true);
-
- private:
-  std::unordered_map<std::string, StatPtr> statSet_;
-  const std::string name_;
-  RWLock lock_;
-};
-
-extern StatSet globalStat;
-
-/*@brief : a simple stat*/
-class Stat {
- public:
-  explicit Stat(const std::string& statName)
-      : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
-  ~Stat() {}
-
-  typedef std::list<std::pair<StatInfo*, pid_t>> ThreadLocalBuf;
-
-  const std::string& getName() const { return name_; }
-
-  void addSample(uint64_t value);
-
-  // clear all stats
-  void reset();
-
-  friend std::ostream& operator<<(std::ostream& outPut, const Stat& stat);
-
-  /*  Set operator << whether to print thread info.
-   *  If openThreadInfo_ == true, then print, else print merge thread info.
-   */
-  void setThreadInfo(bool flag) { openThreadInfo_ = flag; }
-
-  bool getThreadInfo() const { return openThreadInfo_; }
-
-  friend class StatInfo;
-
- private:
-  void mergeThreadStat(StatInfo& allThreadStat);
-
-  std::mutex lock_;
-  ThreadLocalBuf threadLocalBuf_;
-  StatInfo destructStat_;
-  ThreadLocal<StatInfo> statInfo_;
-  const std::string name_;
-  bool openThreadInfo_;
-};
-
-extern StatSet globalStat;
-
-inline StatPtr getStat(const std::string& name) {
-  return globalStat.getStat(name);
-}
-
-inline uint64_t nowInMicroSec() {
-  timeval tvTime;
-  (void)gettimeofday(&tvTime, NULL);
-  return tvTime.tv_sec * 1000000LU + tvTime.tv_usec;
-}
-
-/**
- * A simple help class to measure time interval
- */
-class Timer {
- public:
-  explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
-    if (autoStart) {
-      start();
-    }
-  }
-  void start() { startStamp_ = nowInMicroSec(); }
-  void setStartStamp(uint64_t startStamp) { startStamp_ = startStamp; }
-  uint64_t stop() {
-    total_ += nowInMicroSec() - startStamp_;
-    return total_;
-  }
-
-  uint64_t get() const { return total_; }
-
-  void reset() { total_ = 0; }
-
- protected:
-  uint64_t total_;
-  uint64_t startStamp_;
-};
-
-class TimerOnce {
- public:
-  TimerOnce(Stat* stat,
-            const char* info = "",
-            uint64_t threshold = -1,
-            bool autoStart = true,
-            uint64_t startStamp = 0)
-      : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) {
-    if (!autoStart) {
-      timer_.setStartStamp(startStamp);
-    }
-  }
-  ~TimerOnce() {
-    uint64_t span = timer_.stop();
-    if (span >= threshold_) {
-      LOG(INFO) << "Stat: [" << stat_->getName() << "] " << info_
-                << " [Span:" << span / 1000 << "ms" << span % 1000 << "us"
-                << "] ";
-    }
-    stat_->addSample(span);
-  }
-
- private:
-  Stat* stat_;
-  const char* info_;
-  Timer timer_;
-  uint64_t threshold_;
-};
-
-inline uint64_t registerTimerArg1(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return threshold;
-}
-
-inline StatSet& registerTimerArg2(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return statSet;
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define REGISTER_TIMER(statName, ...)
-#define REGISTER_TIMER_SET(statName, start, ...)
-#define REGISTER_TIMER_DYNAMIC(statName, ...)
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)
-#define REGISTER_TIMER_INFO(statName, info)
-#define FOR_TIMING(statement)
-
-#else
-
-#define FOR_TIMING(statement) statement
-
-// The default arguments are shown in the following line:
-// REGISTER_TIMER(statName, threshold = -1, statSet = globalStat)
-// TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed
-#define REGISTER_TIMER(statName, ...)                             \
-  static ::paddle::StatPtr __stat =                               \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_SET(statName, start, ...)                            \
-  static ::paddle::StatPtr __stat =                                         \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-// dynmaic timer, support to discriminate runtime entity, used in pserver
-#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
-  ::paddle::StatPtr __stat =                                      \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
-  ::paddle::StatPtr __stat =                                                \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-#define REGISTER_TIMER_INFO(statName, info)                                 \
-  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                          \
-      __stat.get(), info, 10 * 1000000LU /*threshold*/);
-
-#endif  // DISABLE_TIMER
-
-class GpuProfiler final {
- public:
-  GpuProfiler(std::string statName, std::string info);
-  ~GpuProfiler();
-
- private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-#ifdef PADDLE_DISABLE_PROFILER
-
-#define REGISTER_GPU_PROFILER(statName, ...)
-
-#else
-
-#define REGISTER_GPU_PROFILER(statName, ...) \
-  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
-
-#endif  // DISABLE_PROFILER
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/StringUtil.cpp b/paddle/legacy/utils/StringUtil.cpp
deleted file mode 100644
index 0c98e6db345..00000000000
--- a/paddle/legacy/utils/StringUtil.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "StringUtil.h"
-
-namespace paddle {
-namespace str {
-
-bool endsWith(const std::string& str, const std::string& ext) {
-  if (str.size() >= ext.size() && ext == str.substr(str.size() - ext.size())) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void split(const std::string& str, char sep, std::vector<std::string>* pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-bool startsWith(const std::string& str, const std::string& prefix) {
-  if (prefix.size() <= str.size()) {
-    for (size_t i = 0; i < prefix.size(); ++i) {
-      if (str[i] != prefix[i]) return false;
-    }
-    return true;
-  } else {
-    return false;
-  }
-}
-
-}  // namespace str
-}  // namespace paddle
diff --git a/paddle/legacy/utils/StringUtil.h b/paddle/legacy/utils/StringUtil.h
deleted file mode 100644
index 95f071cb7de..00000000000
--- a/paddle/legacy/utils/StringUtil.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include "Logging.h"
-
-namespace paddle {
-
-namespace str {
-/// test whether a string ends with another string
-bool endsWith(const std::string& str, const std::string& ext);
-
-bool startsWith(const std::string& str, const std::string& prefix);
-
-/**
- * Use sep to split str into pieces.
- * If str is empty, *pieces will be empty.
- * If str ends with sep, the last piece will be an empty string.
- */
-void split(const std::string& str, char sep, std::vector<std::string>* pieces);
-
-/**
- * Cast string to type T with status.
- *
- * @param [in] s input string.
- * @param [out] ok status, return true if there is no error in casting. Set
- *              nullptr if user don't care error at all.
- * @return result of casting. If error occurred, a default value of T() will
- *         return.
- */
-template <class T>
-inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
-  std::istringstream sin(s);
-  T v;
-  sin >> v;
-  if (ok) {
-    *ok = sin.eof() && !sin.fail();
-  }
-  return v;
-}
-
-/**
- * Cast type T to string with status.
- *
- * @param [in] v input value of type T.
- * @param [out] ok status, return true if there is no error in casting. Set
- *              nullptr if user don't care error at all.
- * @return result of casting. If error occurred, a empty string will be
- *              returned.
- */
-template <class T>
-inline std::string toWithStatus(const T v, bool* ok = nullptr) {
-  std::ostringstream sout;
-  sout << v;
-  if (ok) {
-    *ok = !sout.fail();
-  }
-  return sout.str();
-}
-
-/// Convert string to type T. It makes sure all the characters in s are used.
-/// Otherwise it will abort.
-///
-/// @tparam T type of return
-/// @param s string input.
-template <class T>
-inline T to(const std::string& s) {
-  bool ok;
-  T v = toWithStatus<T>(s, &ok);
-  CHECK(ok) << "Cannot convert s(" << s << ") to type " << typeid(T).name();
-  return v;
-}
-
-/// Convert type T to string.
-///
-/// @tparam T type of input value
-/// @param v input value of type T
-template <class T>
-std::string to_string(T v) {
-  bool ok;
-  std::string s = toWithStatus<T>(v, &ok);
-  CHECK(ok) << "Cannot convert v(" << v << ") to type std::string";
-  return s;
-}
-
-}  // namespace str
-
-#undef DEFINE_STRING_CONVERSION
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Thread.h b/paddle/legacy/utils/Thread.h
deleted file mode 100644
index 2ee6eba1a68..00000000000
--- a/paddle/legacy/utils/Thread.h
+++ /dev/null
@@ -1,615 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <thread>
-#include "Logging.h"
-#include "Util.h"
-
-#include "Queue.h"
-#include "ThreadLocal.h"
-
-#include <future>
-
-namespace paddle {
-
-/**
- * A simple wrapper for std::thread
- */
-
-class Thread {
- public:
-  /**
-   * @brief Construct Function. Default thread pointer is null.
-   */
-  Thread() { thread_ = nullptr; }
-
-  virtual ~Thread() {}
-
-  /**
-   * @brief Creat a new thread and call *run()* function.
-   */
-  void start() {
-    thread_.reset(new std::thread([this]() { this->run(); }));
-  }
-
-  /**
-   * @brief Detach the thread.
-   * It don't need to be waited until it finish.
-   */
-  void detach() { thread_->detach(); }
-
-  /**
-   * @brief Join the thread.
-   * It should be waited until it finish.
-   */
-  void join() { thread_->join(); }
-
-  /**
-   * @brief Define what to be done on this thread through override this
-   * function.
-   */
-  virtual void run() = 0;
-
- protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-/**
- * ThreadWorker maintains a job queue. It executes the jobs in the job queue
- * sequentianlly in a separate thread.
- *
- * Use addJob() to add a new job to the job queue.
- */
-class ThreadWorker : protected Thread {
- public:
-  typedef std::function<void()> JobFunc;
-
-  /**
-   * @brief Construct Function. Default size of job queue is 0 and not stopping.
-   */
-  ThreadWorker() : stopping_(false), empty_(true) { start(); }
-
-  /**
-   * @brief Destruct Function.
-   * If it's running, wait until all job finish and then stop it.
-   */
-  ~ThreadWorker() {
-    if (!stopping_) {
-      wait();
-      stop();
-    }
-  }
-
-  /**
-   * @brief Finish current running job and quit the thread.
-   */
-  void stop() {
-    stopping_ = true;
-    jobs_.enqueue([]() {});
-    join();
-  }
-
-  /**
-   * @brief Add a new job to the job queue.
-   */
-  void addJob(JobFunc func) {
-    empty_ = false;
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Wait until all jobs was done (the job queue was empty).
-   */
-  void wait() {
-    finishCV_.wait([this] { return empty_; });
-  }
-
- protected:
-  /**
-   * @brief Execute jobs in the job queue sequentianlly,
-   * @note If finish all the jobs in the job queue,
-   * notifies all the waiting threads the job queue was empty.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      if (stopping_) break;
-      func();
-      if (jobs_.empty()) {
-        finishCV_.notify_all([this] { empty_ = true; });
-      }
-    }
-  }
-
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-  LockedCondition finishCV_;
-  bool empty_;
-};
-
-/**
- * SyncThreadPool maintains a pool of threads.
- * It executes the job use all workers in the pool.
- *
- * Use exec() to run a new job, job complete when exec returned.
- * Only one job can exec simultaneously.
- *
- * Each worker has an tid whose range is [0, getNumThreads()).
- * JobFunc can use tid to divide input data.
- */
-class SyncThreadPool {
- public:
-  typedef std::function<void(int tid, size_t numThreads)> JobFunc;
-
-  /**
-   * @brief Construct Function. No thread will be created.
-   */
-  SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Construct Fucntion. Create numWorkers of threads in the pool.
-   * @param[in] numWorkers Number of the workers in the pool.
-   * @param[in] checkOwner Default true. If checkOwner is true,
-   * this sync thread pool should be used by it's owner thread.
-   */
-  explicit SyncThreadPool(size_t numWorkers, bool checkOwner = true)
-      : stopping_(false),
-        jobStartBarrier_(numWorkers + 1),
-        jobFinishBarrier_(numWorkers + 1),
-        jobFunc_(nullptr),
-        checkOwner_(checkOwner) {
-    ownerThreadId_ = getTID();
-    workers_.resize(numWorkers);
-    start();
-  }
-
-  ~SyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Return num of threads in the pool.
-   */
-  size_t getNumThreads() { return workers_.size(); }
-
-  /**
-   * @brief Execute a job using all the theads in the pool.
-   * @param[in] jobFunc The function to be executed.
-   * @param[in] ownerFunc Owner thread can do something in owerFunc when job
-   * executing.
-   * @note For the ownerFunc, tid=getNumThreads().
-   */
-  void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
-    if (checkOwner_) {
-      CHECK_EQ(ownerThreadId_, getTID())
-          << "this sync thread pool should be used in one thread";
-    }
-
-    CHECK(jobFunc_ == nullptr);
-    jobFunc_ = jobFunc;
-    jobStartBarrier_.wait();  // notify worker thread start job
-
-    if (ownerFunc) {
-      ownerFunc(workers_.size(), workers_.size());
-    }
-
-    jobFinishBarrier_.wait();  // wait all worker thread complete
-    jobFunc_ = nullptr;
-  }
-
-  /**
-   * @brief Execute a job using all the threads in the pool.
-   * And the owner thread will do the same job.
-   * @param jobFunc The job to be executed.
-   * @note  Assume that JobFunc will execute numThread + 1 times,
-   * with tid ranging [0,numThread]. The thread whose tid is numThread
-   * is the owner thread.
-   */
-  void execPlusOwner(JobFunc jobFunc) { exec(jobFunc, jobFunc); }
-
-  /**
-   * @brief Execute a job if has pool, else use caller thread as a worker.
-   * @param[in] pool The pool to execute the job.
-   * @param[in] jobFunc The job to be excuted.
-   */
-  static void execHelper(SyncThreadPool* pool, JobFunc jobFunc) {
-    if (pool) {
-      pool->exec(jobFunc);
-    } else {
-      jobFunc(0, 1);
-    }
-  }
-
- protected:
-  /**
-   * @brief Start all the workers in the pool, call their run() function.
-   */
-  void start() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      workers_[i].reset(
-          new std::thread([this](int tid) { this->run(tid); }, i));
-    }
-  }
-
-  /**
-   * @brief Stop all the workers in the pool.
-   */
-  void stop() {
-    stopping_ = true;
-    // notify worker thread to stop
-    jobStartBarrier_.wait();
-
-    // stop workers
-    for (auto& thread : workers_) {
-      if (thread) {
-        thread->join();
-        thread.reset(nullptr);
-      }
-    }
-  }
-
-  /**
-   * @brief Execute the jobFunc_ using the worker thread tid, if not stopping.
-   */
-  void run(int tid) {
-    VLOG(1) << "SyncThreadPool worker thread " << tid;
-    // init seed deterministic, but differs from global srand()
-    ThreadLocalRand::initThreadSeed(tid + workers_.size());
-
-    while (true) {
-      jobStartBarrier_.wait();  // wait job
-
-      if (stopping_) {
-        break;
-      }
-
-      jobFunc_(tid, workers_.size());
-
-      jobFinishBarrier_.wait();  // notify job complete
-    }
-  }
-
- protected:
-  pid_t ownerThreadId_;
-  bool stopping_;
-  ThreadBarrier jobStartBarrier_;
-  ThreadBarrier jobFinishBarrier_;
-
-  JobFunc jobFunc_;
-  bool checkOwner_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * MultiThreadWorker maintains a job queue and a result queue.
- * It executes the jobs in the job queue and puts the results into the
- * result queue sequentially in multi separate threads.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue
- *        (the user added jobs should not return nullptr).
- *
- *    Use stopAddJob() to stop adding new jobs to the job queue
- *        (addJob() can not be called after stopAddJob()).
- *
- * Normal stop:
- *
- *    Use waitResult() to get the results until nullptr is returned.
- *    Use stop() to exit normally
- *        (stopAddJob() should be called first).
- *
- * Force stop:
- *
- *    Use forceStop() to exit forcibly even though there are remaining jobs in
- * the
- * job queue.
- */
-template <class T>
-class MultiThreadWorker {
- public:
-  typedef T ResultType;
-  typedef std::shared_ptr<ResultType> ResultPtrType;
-  typedef std::function<ResultPtrType()> JobFunc;
-  /**
-   * @brief Construct Function. Initialize the multithread worker.
-   * @param[in] workerNum Number of the workers.
-   * @param[in] queueCapacity Capapcity of the result queue.
-   */
-  MultiThreadWorker(size_t workerNum, size_t queueCapacity)
-      : stopping_(false),
-        jobAdding_(true),
-        nullResultNum_(0),
-        results_(queueCapacity) {
-    workers_.resize(workerNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  /**
-   * @brief Destruct Function. Force stop the workers
-   * even though there are remaining jobs in the job queue.
-   */
-  virtual ~MultiThreadWorker() { forceStop(); }
-
-  /**
-   * @brief Stop all the workers normally.
-   * @note stopAddJob() should be called before it.
-   */
-  void stop() {
-    CHECK(!jobAdding_) << "stopAddJob() should be called before stop()";
-    for (auto& worker : workers_) {
-      if (worker) {
-        worker->join();
-        worker = nullptr;
-      }
-    }
-    stopping_ = true;
-  }
-
-  /**
-   * @brief Stop all the workers forcibly.
-   * @note This function will call stopAddJob() first
-   * and empty the result queue.
-   */
-  void forceStop() {
-    if (!stopping_) {
-      stopping_ = true;
-      stopAddJob();
-      while (nullptr != waitResult()) {
-      }
-      stop();
-    }
-  }
-
-  /**
-   * @brief Add a job to the job queue.
-   * @note Job can not be added after calling stopAddJob().
-   */
-  void addJob(JobFunc func) {
-    CHECK(jobAdding_) << "addJob() can not be called after stopAddJob()";
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Stop adding new jobs to the job queue.
-   * @note This fuction enqueue a return nullptr function to the job queue.
-   */
-  void stopAddJob() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      jobs_.enqueue([]() { return nullptr; });
-    }
-    jobAdding_ = false;
-  }
-
-  /**
-   * @brief Dequeue the first result in the result queue and return it.
-   * @note If the result queue is empty, wait until it's not empty
-   * or return nullptr if all the results have been returned.
-   */
-  ResultPtrType waitResult() {
-    while (true) {
-      ResultPtrType result = results_.dequeue();
-      if (result) {
-        return result;
-      }
-
-      ++nullResultNum_;
-      if (nullResultNum_ == workers_.size()) {
-        return nullptr;
-      }
-    }
-  }
-
-  /**
-   * @brief The result queue is empty or not.
-   * @return true if empty.
-   */
-  bool testResult() { return results_.empty(); }
-
- protected:
-  /**
-   * @brief Do the jobs in the job queue sequentianlly
-   * and enqueue the result into the result queue.
-   * @note A nullptr will be enqueued into the resulte queue, when a worker
-   * finished.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      ResultPtrType result = func();
-      if (result == nullptr || stopping_) {
-        // When a worker finished, a nullptr would be enqueued into results_
-        results_.enqueue(nullptr);
-        break;
-      }
-      results_.enqueue(result);
-    }
-  }
-
-  bool stopping_;
-  bool jobAdding_;
-  size_t nullResultNum_;
-  Queue<JobFunc> jobs_;
-  BlockingQueue<ResultPtrType> results_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * AsyncThreadPool maintains a job queue and threads pool.
- * It executes the jobs from queue asynchronously.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue and get a std::future
- *    result. The caller's thread continues running. Call std::future::get()
- *    when the result's value is needed, and the caller's thread may be
- *    blocked until thread-pool finished the job.
- *
- *    Use addBatchJobs() to add a batch of jobs.
- *    Unlike addJob()'s asynchronization, addBatchJobs will block caller's
- *    thread until all jobs in the batch are finished.
- *
- * Stop:
- *    Use stop() to stop the thread pool. Job can be added once stopped.
- *
- * Process-wide Singleton:
- *    Use AsyncThreadPool::ProcessChannel(N) first to create N threads.
- *    Then call AsyncThreadPool::ProcessChannel() to get the process-wide global
- *    thread pool.
- */
-class AsyncThreadPool {
- public:
-  typedef std::function<void()> JobFunc;
-
-  AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Construct Function. Install all the workers.
-   * @param[in] threadNum Number of the threads, must greater than 1.
-   */
-  explicit AsyncThreadPool(size_t threadNum) {
-    CHECK_GT(threadNum, 1U);
-    stopping_ = false;
-    workers_.resize(threadNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  ~AsyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Stop all the workers normally.
-   */
-  void stop() {
-    stopping_ = true;
-    for (size_t i = 0; i < workers_.size(); i++) {
-      jobs_.enqueue([] {});
-    }
-    for (auto& worker : workers_) {
-      worker->join();
-    }
-  }
-
-  /**
-   * @brief A process-wide singleton. Used as a global thread pool
-   *    It should be initialized by calling
-   *    AsyncThreadPool::ProcessChannel(N) first to create N threads,
-   *    then call AsyncThreadPool::ProcessChannel() will get the thread pool.
-   */
-  static AsyncThreadPool& ProcessChannel(size_t initThreadNum = 0) {
-    static std::shared_ptr<AsyncThreadPool> channel(
-        new AsyncThreadPool(initThreadNum));
-    return *channel;
-  }
-
-  /**
-   * @brief Add a job to queue and return a std::future.
-   * @note The job will be executed
-   * asynchronously.
-   * Call std::future::get() when the execturation result is needed;
-   */
-  template <class F, class... Args>
-  auto addJob(F&& f, Args&&... args)
-      -> std::future<typename std::result_of<F(Args...)>::type> {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    typedef typename std::result_of<F(Args...)>::type T;
-
-    auto task = std::make_shared<std::packaged_task<T()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-    auto res = task->get_future();
-    jobs_.enqueue([task] { (*task)(); });
-    return res;
-  }
-
-  /**
-   * @brief Add a batch of jobs to the queue. The main thread will be blocked
-   * until these jobs are finished.
-   * The results will be stored in  `results` according to `jobs` order.
-   *
-   * @tparam F should have a return value.
-   *
-   * @param[in] jobs a vector of executable objection.
-   * @param[in] results a vector to store the results.
-   *
-   * @note *results* may need to be carefully cleared before *addBatchJobs()*.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs,
-                    std::vector<typename std::result_of<F()>::type>& results) {
-    typedef typename std::result_of<F()>::type T;
-    static_assert(!std::is_same<T, void>::value,
-                  "should pass a non-void function as job");
-
-    std::vector<std::future<T>> resFuts;
-    for (const auto& job : jobs) {
-      resFuts.emplace_back(addJob(job));
-    }
-    for (auto& fut : resFuts) {
-      results.emplace_back(fut.get());
-    }
-  }
-
-  /**
-   * @brief Add a batch of jobs reguardless of its result.
-   * @tparam F don't need to have a return value.
-   * @param[in] jobs a vector of executable objection.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs) {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    std::vector<std::future<bool>> tmpRes;
-
-    for (const auto& job : jobs) {
-      tmpRes.emplace_back(addJob([&job] {
-        job();
-        return true;
-      }));
-    }
-
-    for (auto& res : tmpRes) {
-      res.get();
-    }
-  }
-
- protected:
-  /**
-   * @brief Execute the jobs in the job queue.
-   */
-  void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      func();
-      if (stopping_) break;
-    }
-  }
-
- private:
-  std::vector<std::unique_ptr<std::thread>> workers_;
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-};  // class AsyncThreadPool
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/ThreadLocal.cpp b/paddle/legacy/utils/ThreadLocal.cpp
deleted file mode 100644
index 58fe51bd40c..00000000000
--- a/paddle/legacy/utils/ThreadLocal.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadLocal.h"
-
-#include <gflags/gflags.h>
-
-#include "Util.h"
-
-DEFINE_bool(thread_local_rand_use_global_seed,
-            false,
-            "Whether to use global seed in thread local rand.");
-
-namespace paddle {
-
-unsigned int ThreadLocalRand::defaultSeed_ = 1;
-ThreadLocal<unsigned int> ThreadLocalRand::seed_;
-
-unsigned int* ThreadLocalRand::getSeed() {
-  unsigned int* p = seed_.get(false /*createLocal*/);
-  if (!p) {  // init seed
-    if (FLAGS_thread_local_rand_use_global_seed) {
-      p = new unsigned int(defaultSeed_);
-    } else if (getpid() == getTID()) {  // main thread
-      // deterministic, but differs from global srand()
-      p = new unsigned int(defaultSeed_ - 1);
-    } else {
-      p = new unsigned int(defaultSeed_ + getTID());
-      VLOG(3) << "thread use undeterministic rand seed:" << *p;
-    }
-    seed_.set(p);
-  }
-  return p;
-}
-
-ThreadLocal<std::default_random_engine> ThreadLocalRandomEngine::engine_;
-std::default_random_engine& ThreadLocalRandomEngine::get() {
-  auto engine = engine_.get(false);
-  if (!engine) {
-    engine = new std::default_random_engine;
-    int defaultSeed = ThreadLocalRand::getDefaultSeed();
-    engine->seed(FLAGS_thread_local_rand_use_global_seed
-                     ? defaultSeed
-                     : defaultSeed + getTID());
-    engine_.set(engine);
-  }
-  return *engine;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h
deleted file mode 100644
index 6268b73a855..00000000000
--- a/paddle/legacy/utils/ThreadLocal.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef _WIN32
-#include <pthread.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#include <sys/types.h>
-#include <map>
-#include <mutex>
-#include <random>
-#include "Logging.h"
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * Thread local storage for object.
- * Example:
- *
- * Declarartion:
- * ThreadLocal<vector<int>> vec_;
- *
- * Use in thread:
- * vector<int>& vec = *vec; // obtain the thread specific object
- * vec.resize(100);
- *
- * Note that this ThreadLocal will desconstruct all internal data when thread
- * exits
- * This class is suitable for cases when frequently creating and deleting
- * threads.
- *
- * Consider implementing a new ThreadLocal if one needs to frequently create
- * both instances and threads.
- *
- * see also ThreadLocalD
- */
-template <class T>
-class ThreadLocal {
- public:
-  ThreadLocal() {
-    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
-  }
-  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
-
-  /**
-   * @brief get thread local object.
-   * @param if createLocal is true and thread local object is never created,
-   * return a new object. Otherwise, return nullptr.
-   */
-  T* get(bool createLocal = true) {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p && createLocal) {
-      p = new T();
-      int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK_EQ(ret, 0);
-    }
-    return p;
-  }
-
-  /**
-   * @brief set (overwrite) thread local object. If there is a thread local
-   * object before, the previous object will be destructed before.
-   *
-   */
-  void set(T* p) {
-    if (T* q = get(false)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-  }
-
-  /**
-   * return reference.
-   */
-  T& operator*() { return *get(); }
-
-  /**
-   * Implicit conversion to T*
-   */
-  operator T*() { return get(); }
-
- private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  pthread_key_t threadSpecificKey_;
-};
-
-/**
- * Almost the same as ThreadLocal, but note that this ThreadLocalD will
- * destruct all internal data when ThreadLocalD instance destructs.
- *
- * This class is suitable for cases when frequently creating and deleting
- * objects.
- *
- * see also ThreadLocal
- *
- * @note The type T must implemented default constructor.
- */
-template <class T>
-class ThreadLocalD {
- public:
-  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
-  ~ThreadLocalD() {
-    pthread_key_delete(threadSpecificKey_);
-    for (auto t : threadMap_) {
-      dataDestructor(t.second);
-    }
-  }
-
-  /**
-   * @brief Get thread local object. If not exists, create new one.
-   */
-  T* get() {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p) {
-      p = new T();
-      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-      updateMap(p);
-    }
-    return p;
-  }
-
-  /**
-   * @brief Set thread local object. If there is an object create before, the
-   * old object will be destructed.
-   */
-  void set(T* p) {
-    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-    updateMap(p);
-  }
-
-  /**
-   * @brief Get reference of the thread local object.
-   */
-  T& operator*() { return *get(); }
-
- private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  void updateMap(T* p) {
-    pid_t tid = getTID();
-    CHECK_NE(tid, -1);
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto ret = threadMap_.insert(std::make_pair(tid, p));
-    if (!ret.second) {
-      ret.first->second = p;
-    }
-  }
-
-  pthread_key_t threadSpecificKey_;
-  std::mutex mutex_;
-  std::map<pid_t, T*> threadMap_;
-};
-
-/**
- * @brief Thread-safe C-style random API.
- */
-class ThreadLocalRand {
- public:
-  /**
-   * initSeed just like srand,
-   * called by main thread,
-   * init defaultSeed for all thread
-   */
-  static void initSeed(unsigned int seed) { defaultSeed_ = seed; }
-
-  /**
-   * initThreadSeed called by each thread,
-   * init seed to defaultSeed + *tid*
-   * It should be called after main initSeed and before using rand()
-   * It's optional, getSeed will init seed if it's not initialized.
-   */
-  static void initThreadSeed(int tid) {
-    seed_.set(new unsigned int(defaultSeed_ + tid));
-  }
-
-  /// thread get seed, then can call rand_r many times.
-  /// Caller thread can modify the seed value if it's necessary.
-  ///
-  /// if flag thread_local_rand_use_global_seed set,
-  /// the seed will be set to defaultSeed in thread's first call.
-  static unsigned int* getSeed();
-
-  /// like ::rand
-  static int rand() { return rand_r(getSeed()); }
-
-  /**
-   * Get defaultSeed for all thread.
-   */
-  static int getDefaultSeed() { return defaultSeed_; }
-
- protected:
-  static unsigned int defaultSeed_;
-  static ThreadLocal<unsigned int> seed_;
-};
-
-/**
- * @brief Thread-safe C++ style random engine.
- */
-class ThreadLocalRandomEngine {
- public:
-  /**
-   * get random_engine for each thread.
-   *
-   * Engine's seed will be initialized by ThreadLocalRand.
-   */
-  static std::default_random_engine& get();
-
- protected:
-  static ThreadLocal<std::default_random_engine> engine_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Util.cpp b/paddle/legacy/utils/Util.cpp
deleted file mode 100644
index 2755fdd9cd1..00000000000
--- a/paddle/legacy/utils/Util.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Util.h"
-
-#include <dirent.h>
-#include <signal.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-
-#include <fstream>
-#include <mutex>
-
-#include <gflags/gflags.h>
-
-#include "CpuId.h"
-#include "CustomStackTrace.h"
-#include "Logging.h"
-#include "StringUtil.h"
-#include "Thread.h"
-#include "ThreadLocal.h"
-#include "Version.h"
-
-DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
-
-#ifdef WITH_GOOGLE_PERFTOOLS
-/*
-  In order to use google profiler, you need to install gperftools,
-  which can be obtained at:
-  https://gperftools.googlecode.com/files/gperftools-2.0.tar.gz
-
-  gperftools should be configured with --enable-frame-pointers
-
-  Then link the executable with -lprofiler.
-
-  After you start the application, you can use kill -s signal PID to
-  start/stop profiling. The profile data will be stored in file
-  FLAGS_profile_data_file, which can be analyzed by pprof.
-*/
-
-#include <gperftools/profiler.h>
-
-DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
-
-static void profilerSwitch(int signalNumber) {
-  bool static started = false;
-
-  if (!started) {
-    if (ProfilerStart(FLAGS_profile_data_file.c_str())) {
-      LOG(INFO) << "Profiler started";
-    } else {
-      LOG(WARNING) << "Can't turn on cpu profiling for "
-                   << FLAGS_profile_data_file;
-    }
-  } else {
-    ProfilerStop();
-    LOG(INFO) << "Profiler stopped";
-  }
-  started = !started;
-}
-
-static void installProfilerSwitch() {
-  sighandler_t oldHandler = signal(FLAGS_profile_signal, profilerSwitch);
-
-  if (!oldHandler) {
-    LOG(INFO) << "Using signal " << FLAGS_profile_signal
-              << " to turn on/off profiler";
-  } else {
-    LOG(WARNING) << "Signal " << FLAGS_profile_signal << " is already in use\n";
-  }
-}
-
-#else
-
-static void installProfilerSwitch() {}
-
-#endif  // WITH_GOOGLE_PERFTOOLS
-
-namespace paddle {
-
-pid_t getTID() {
-#if defined(__APPLE__) || defined(__OSX__)
-  // syscall is deprecated: first deprecated in macOS 10.12.
-  // syscall is unsupported;
-  // syscall pid_t tid = syscall(SYS_thread_selfid);
-  uint64_t tid;
-  pthread_threadid_np(NULL, &tid);
-#else
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-#endif
-  CHECK_NE((int)tid, -1);
-  return tid;
-}
-
-static bool g_initialized = false;
-typedef std::pair<int, std::function<void()>> PriorityFuncPair;
-typedef std::vector<PriorityFuncPair> InitFuncList;
-static InitFuncList* g_initFuncs = nullptr;
-static std::once_flag g_onceFlag;
-void registerInitFunction(std::function<void()> func, int priority) {
-  if (g_initialized) {
-    LOG(FATAL) << "registerInitFunction() should only called before initMain()";
-  }
-  if (!g_initFuncs) {
-    g_initFuncs = new InitFuncList();
-  }
-  g_initFuncs->push_back(std::make_pair(priority, func));
-}
-
-void runInitFunctions() {
-  std::call_once(g_onceFlag, []() {
-    VLOG(3) << "Calling runInitFunctions";
-    if (g_initFuncs) {
-      std::sort(g_initFuncs->begin(),
-                g_initFuncs->end(),
-                [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
-                  return x.first > y.first;
-                });
-      for (auto& f : *g_initFuncs) {
-        f.second();
-      }
-      delete g_initFuncs;
-      g_initFuncs = nullptr;
-    }
-    g_initialized = true;
-    VLOG(3) << "Call runInitFunctions done.";
-  });
-}
-
-void initMain(int argc, char** argv) {
-  installLayerStackTracer();
-  std::string line;
-  for (int i = 0; i < argc; ++i) {
-    line += argv[i];
-    line += ' ';
-  }
-
-#ifndef GFLAGS_GFLAGS_H_
-  namespace gflags = google;
-#endif
-
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  initializeLogging(argc, argv);
-  LOG(INFO) << "commandline: " << line;
-  CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
-
-  installProfilerSwitch();
-
-#ifdef __SSE__
-  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-#endif
-#ifdef __SSE3__
-  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#endif
-
-  if (FLAGS_seed == 0) {
-    unsigned int t = time(NULL);
-    srand(t);
-    ThreadLocalRand::initSeed(t);
-    LOG(INFO) << "random number seed=" << t;
-  } else {
-    srand(FLAGS_seed);
-    ThreadLocalRand::initSeed(FLAGS_seed);
-  }
-
-  if (FLAGS_use_gpu) {
-    // This is the initialization of the CUDA environment,
-    // need before runInitFunctions.
-    // TODO(hedaoyuan) Can be considered in the runInitFunctions,
-    // but to ensure that it is the first to initialize.
-    hl_start();
-    hl_init(FLAGS_gpu_id);
-  }
-
-  version::printVersion();
-  checkCPUFeature().check();
-  runInitFunctions();
-}
-
-std::string readFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-
-  // get length of file:
-  is.seekg(0, is.end);
-  size_t length = is.tellg();
-  is.seekg(0, is.beg);
-  std::string str(length, (char)0);
-  CHECK(is.read(&str[0], length)) << "Fail to read file: " << fileName;
-  return str;
-}
-
-namespace path {
-
-std::string basename(const std::string& path) {
-  size_t pos = path.rfind(sep);
-  ++pos;
-  return path.substr(pos, std::string::npos);
-}
-
-std::string dirname(const std::string& path) {
-  size_t pos = path.rfind(sep);
-  if (pos == std::string::npos) return std::string();
-  return path.substr(0, pos);
-}
-
-std::string join(const std::string& part1, const std::string& part2) {
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-}  // namespace path
-
-void copyFileToPath(const std::string& file, const std::string& dir) {
-  VLOG(3) << "copy " << file << " to " << dir;
-  std::string fileName = path::basename(file);
-  std::string dst = path::join(dir, fileName);
-  std::ifstream source(file, std::ios_base::binary);
-  std::ofstream dest(dst, std::ios_base::binary);
-  CHECK(source) << "Fail to open " << file;
-  CHECK(dest) << "Fail to open " << dst;
-  dest << source.rdbuf();
-  source.close();
-  dest.close();
-}
-
-bool fileExist(const char* filename) { return (access(filename, 0) == 0); }
-
-void touchFile(const char* filename) {
-  if (!fileExist(filename)) {
-    std::ofstream os(filename);
-  }
-}
-
-int isDir(const char* path) {
-  struct stat s_buf;
-  if (stat(path, &s_buf)) {
-    return 0;
-  }
-  return S_ISDIR(s_buf.st_mode);
-}
-
-void rmDir(const char* folderName) {
-  if (isDir(folderName)) {
-    DIR* dp;
-    struct dirent* ep;
-    std::string buf;
-    dp = opendir(folderName);
-    while ((ep = readdir(dp)) != NULL) {
-      if (strcmp(ep->d_name, ".") && strcmp(ep->d_name, "..")) {
-        buf = std::string(folderName) + "/" + std::string(ep->d_name);
-        if (isDir(buf.c_str())) {
-          rmDir(buf.c_str());
-        } else {
-          remove(buf.c_str());
-        }
-      }
-    }
-    closedir(dp);
-    rmdir(folderName);
-  }
-}
-
-void mkDir(const char* filename) {
-  if (mkdir(filename, 0755)) {
-    CHECK(errno == EEXIST) << filename << "mkdir failed!";
-  }
-}
-
-void mkDirRecursively(const char* dir) {
-  struct stat sb;
-
-  if (*dir == 0) return;  // empty string
-  if (!stat(dir, &sb)) return;
-
-  mkDirRecursively(path::dirname(dir).c_str());
-
-  mkDir(dir);
-}
-
-void loadFileList(const std::string& fileListFileName,
-                  std::vector<std::string>& fileList) {
-  std::ifstream is(fileListFileName);
-  CHECK(is) << "Fail to open " << fileListFileName;
-  std::string line;
-  while (is) {
-    if (!getline(is, line)) break;
-    fileList.push_back(line);
-  }
-}
-
-double getMemoryUsage() {
-#if defined(__ANDROID__)
-  return 0.0;
-#else
-  FILE* fp = fopen("/proc/meminfo", "r");
-  CHECK(fp) << "failed to fopen /proc/meminfo";
-  size_t bufsize = 256 * sizeof(char);
-  char* buf = new (std::nothrow) char[bufsize];
-  CHECK(buf);
-  int totalMem = -1;
-  int freeMem = -1;
-  int bufMem = -1;
-  int cacheMem = -1;
-  while (getline(&buf, &bufsize, fp) >= 0) {
-    if (0 == strncmp(buf, "MemTotal", 8)) {
-      if (1 != sscanf(buf, "%*s%d", &totalMem)) {
-        LOG(FATAL) << "failed to get MemTotal from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "MemFree", 7)) {
-      if (1 != sscanf(buf, "%*s%d", &freeMem)) {
-        LOG(FATAL) << "failed to get MemFree from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "Buffers", 7)) {
-      if (1 != sscanf(buf, "%*s%d", &bufMem)) {
-        LOG(FATAL) << "failed to get Buffers from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "Cached", 6)) {
-      if (1 != sscanf(buf, "%*s%d", &cacheMem)) {
-        LOG(FATAL) << "failed to get Cached from string: [" << buf << "]";
-      }
-    }
-    if (totalMem != -1 && freeMem != -1 && bufMem != -1 && cacheMem != -1) {
-      break;
-    }
-  }
-  CHECK(totalMem != -1 && freeMem != -1 && bufMem != -1 && cacheMem != -1)
-      << "failed to get all information";
-  fclose(fp);
-  delete[] buf;
-  double usedMem = 1.0 - 1.0 * (freeMem + bufMem + cacheMem) / totalMem;
-  return usedMem;
-#endif
-}
-
-SyncThreadPool* getGlobalSyncThreadPool() {
-  static std::unique_ptr<SyncThreadPool> syncThreadPool;
-  if (syncThreadPool &&
-      syncThreadPool->getNumThreads() != (size_t)FLAGS_trainer_count) {
-    LOG(WARNING) << "trainer_count changed in training process!";
-    syncThreadPool.reset(nullptr);
-  }
-  if (!syncThreadPool) {
-    syncThreadPool.reset(new SyncThreadPool(FLAGS_trainer_count));
-  }
-  return syncThreadPool.get();
-}
-
-size_t calculateServiceNum(const std::string& pservers, int ports_num) {
-  std::vector<std::string> hosts;
-  str::split(pservers, ',', &hosts);
-  return hosts.size() * ports_num;
-}
-
-void memcpyWithCheck(void* dest,
-                     const void* src,
-                     size_t num,
-                     const void* srcEnd) {
-  int minus = (char*)srcEnd - (char*)src - num;
-  CHECK_LE(0, minus) << "memcpyWithCheck: copy " << num
-                     << " bytes data out of range.";
-  memcpy(dest, src, num);
-}
-
-hl_activation_mode_t hlActiveType(const std::string& type) {
-  if (type == "sigmoid") {
-    return HL_ACTIVATION_SIGMOID;
-  } else if (type == "relu") {
-    return HL_ACTIVATION_RELU;
-  } else if (type == "tanh") {
-    return HL_ACTIVATION_TANH;
-  } else if (type == "linear" || type == "") {
-    return HL_ACTIVATION_LINEAR;
-  } else {
-    LOG(FATAL) << "Do not support activation type " << type;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h
deleted file mode 100644
index 3a878b2b301..00000000000
--- a/paddle/legacy/utils/Util.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef _WIN32
-#include <sys/syscall.h>  // for syscall()
-#endif
-#include <sys/types.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "Common.h"
-#include "Logging.h"
-#include "TrainerConfig.pb.h"
-
-#include "Flags.h"
-#include "hl_gpu.h"
-
-#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
-inline int rand_r(unsigned int* seedp) {
-  (void)seedp;
-  return rand();
-}
-#endif
-
-#ifdef _WIN32
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#include <windows.h>
-
-template <typename T>
-inline int __builtin_clz(const T& value) {
-  DWORD leadning_zero = 0;
-  if (_BitScanReverse(&leadning_zero, value)) {
-    return static_cast<int>(sizeof(T) * 8 - leadning_zero);
-  } else {
-    return static_cast<int>(0);
-  }
-}
-
-inline int __builtin_clzl(const unsigned long& value) {
-  return __builtin_clz(value);
-}
-
-inline int __builtin_clzll(const unsigned long long& value) {
-  return __builtin_clz(value);
-}
-
-#define pid_t int
-#endif
-
-/**
- * Loop over the elements in a container
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH(iterator_name, container)                              \
-  for (auto iterator_name = (container).begin(), e = (container).end(); \
-       iterator_name != e;                                              \
-       ++iterator_name)
-
-/**
- * Loop over the elements in a container in reverse order
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH_R(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH_R(iterator_name, container)                              \
-  for (auto iterator_name = (container).rbegin(), e = (container).rend(); \
-       iterator_name != e;                                                \
-       ++iterator_name)
-
-namespace paddle {
-
-// return the thread id used by glog
-pid_t getTID();
-
-/**
- * return the 1-based index of the highest bit set
- *
- * for x > 0:
- * \f[
- *    findLastSet(x) = 1 + \floor*{\log_{2}x}
- * \f]
- */
-inline constexpr size_t findLastSet(size_t x) {
-  return std::is_same<size_t, unsigned int>::value
-             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
-             : (std::is_same<size_t, unsigned long>::value  // NOLINT
-                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
-                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-}
-
-/**
- * calculate the non-negative remainder of a/b
- * @param[in] a
- * @param[in] b, should be positive
- * @return the non-negative remainder of a / b
- */
-inline int mod(int a, int b) {
-  int r = a % b;
-  return r >= 0 ? r : r + b;
-}
-
-/**
- * find the value given a key k from container c.
- * If the key can be found, the value is stored in *value
- * return true if the key can be found. false otherwise.
- */
-template <class K, class V, class C>
-bool mapGet(const K& k, const C& c, V* value) {
-  auto it = c.find(k);
-  if (it != c.end()) {
-    *value = it->second;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <class Container, class T>
-static bool contains(const Container& container, const T& val) {
-  return std::find(container.begin(), container.end(), val) != container.end();
-}
-
-/**
- * pop and get the front element of a container
- */
-template <typename Container>
-typename Container::value_type pop_get_front(Container& c) {
-  typename Container::value_type v;
-  swap(v, c.front());
-  c.pop_front();
-  return v;
-}
-
-#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
-
-/**
- * Initialize some creators or initFunctions for layers and data
- * providers.
- * Client codes should call this function before they refer any other
- * codes that use the layer class and data provider class.
- *
- * Codes inside 'core' directory can call initMain which calls
- * runInitFunctions directly, while codes outside core can simply
- * call runInitFunctions if they don't need the commandline flags
- * designed for PADDLE main procedure.
- */
-void runInitFunctions();
-
-/**
- * Initialize logging and parse commandline
- */
-void initMain(int argc, char** argv);
-
-// read the whole file into a string
-std::string readFile(const std::string& fileName);
-
-// copy file to path
-void copyFileToPath(const std::string& file, const std::string& path);
-
-// test file exist or not
-bool fileExist(const char* filename);
-// touch file if not exist
-void touchFile(const char* filename);
-// make dir if not exist
-void mkDir(const char* filename);
-void mkDirRecursively(const char* filename);
-
-void rmDir(const char* folderName);
-
-// load a file list file into a vector(fileList)
-void loadFileList(const std::string& fileListFileName,
-                  std::vector<std::string>& fileList);
-
-/**
- * Register a function, the function will be called in initMain(). Functions
- * with higher priority will be called first. The execution order of functions
- * with same priority is not defined.
- */
-void registerInitFunction(std::function<void()> func, int priority = 0);
-class InitFunction {
- public:
-  explicit InitFunction(std::function<void()> func, int priority = 0) {
-    registerInitFunction(func, priority);
-  }
-};
-
-/**
- * Class SetDevice provides a mechanism for set device enviroment.
- * When a SetDevice object is created, it attempts to change device enviroment.
- * When the SetDevice object is destructed, it will restore device environment.
- */
-class SetDevice {
- public:
-  explicit SetDevice(int deviceId) {
-    isSet_ = deviceId >= 0;
-    devId_ = 0;
-    if (isSet_) {
-      devId_ = hl_get_device();
-      hl_set_device(deviceId);
-    }
-  }
-  ~SetDevice() {
-    if (isSet_) {
-      hl_set_device(devId_);
-    }
-  }
-
- protected:
-  bool isSet_;
-  int devId_;
-};
-
-/**
- * Enables direct access to memory allocations on a peer device(d2).
- * input:
- * *d1* is device can direct access device d2.
- * *d2* is peer device to enable direct access to by the d1 device.
- */
-inline void enablePeerAccess(int d1, int d2) {
-#ifdef PADDLE_WITH_CUDA
-  if (hl_device_can_access_peer(d1, d2)) {
-    SetDevice dev(d1);
-    hl_device_enable_peer_access(d2);
-  }
-#else
-  LOG(FATAL) << "Paddle should be compiled in GPU mode to use this method.";
-#endif
-}
-
-/**
- * Change the gpu computation mode to asynchronized mode for the rest of the
- * compilation block. This is useful if the computation consists of multiple
- * small steps. Async mode can overlap the cuda-kernel launch overhead with the
- * actual computation.
- * Example:
- * {
- *    AsycnGpuBlock asyncBlock;
- *    do_some_gpu_computation
- * }
- */
-class AsyncGpuBlock {
- public:
-  AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
-  ~AsyncGpuBlock() {
-    if (syncFlag_) {
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      hl_set_sync_flag(syncFlag_);
-    }
-  }
-
- private:
-  bool syncFlag_;
-};
-
-inline bool useGpu(int deviceId) {
-  return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu;
-}
-
-/*
- * hppl activation mode
- */
-hl_activation_mode_t hlActiveType(const std::string& type);
-
-/**
- * Return value: memory usage ratio (from 0-1)
- */
-double getMemoryUsage();
-
-/**
- * split array by index.
- * used by sync multi thread task,
- * each thread call calcSplitArrayInterval with thread id,
- * get a interval as return.
- * input:
- * *totalSize* is array size,
- * *tId* is thread id, *tSize* is total worker thread num
- * output:
- * start and end index as a std::pair
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize) {
-  size_t start = totalSize * tId / tSize;
-  size_t end = totalSize * (tId + 1) / tSize;
-  return std::make_pair(start, end);
-}
-
-/**
- * same as above, but split at boundary of block.
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize,
-                                                        size_t blockSize) {
-  size_t numBlocks = totalSize / blockSize;
-  if (numBlocks * blockSize < totalSize) {
-    numBlocks++;
-  }
-
-  auto interval = calcSplitArrayInterval(numBlocks, tId, tSize);
-  size_t start = std::min(interval.first * blockSize, totalSize);
-  size_t end = std::min(interval.second * blockSize, totalSize);
-
-  return std::make_pair(start, end);
-}
-
-// Calculate the number of pservers/dservers based
-// on the host list and port_num.
-size_t calculateServiceNum(const std::string& pservers, int ports_num);
-
-/**
- * sort and unique ids vector.
- */
-inline void uniqueIds(std::vector<uint32_t>& ids) {
-  std::sort(ids.begin(), ids.end());
-  auto endpos = std::unique(ids.begin(), ids.end());
-  ids.erase(endpos, ids.end());
-}
-
-/**
- * Read Type value
- */
-template <typename T>
-T readT(char*& p, const char* pEnd) {
-  int minus = pEnd - p - sizeof(T);
-  CHECK_LE(0, minus) << "readT: Out of range.";
-  T v = *reinterpret_cast<T*>(p);
-  p += sizeof(T);
-  return v;
-}
-
-void memcpyWithCheck(void* dest,
-                     const void* src,
-                     size_t num,
-                     const void* srcEnd);
-
-/**
- * A global sync thread pool, has #FLAGS_trainer_count of threads.
- * can be used in main thread.
- */
-class SyncThreadPool;
-SyncThreadPool* getGlobalSyncThreadPool();
-
-namespace path {
-
-// directory separator
-const char sep = '/';
-
-// Return the base name of pathname path.
-std::string basename(const std::string& path);
-
-// Return the directory name of path. If the path does not contains any
-// directory, it returns an empty string.
-std::string dirname(const std::string& path);
-
-/*
-  Join two path components intelligently.
-  The return value is the concatenation of part1 and part2 with exactly one
-  directory separator (path.sep) following each non-empty part except the last,
-  meaning that the result will only end in a separator if the last part is
-  empty.
-  If a component is an absolute path, all previous components are thrown away
-  and joining continues from the absolute path component.
-*/
-std::string join(const std::string& part1, const std::string& part2);
-
-template <typename... Args>
-std::string join(const std::string& part1,
-                 const std::string& part2,
-                 Args... args) {
-  return join(join(part1, part2), args...);
-}
-
-}  // namespace path
-
-/**
- * A Checker for each invoke of method in same thread.
- */
-class SameThreadChecker {
- public:
-  SameThreadChecker() {}
-
-  /**
-   * Disable copy
-   */
-  SameThreadChecker(const SameThreadChecker& other) = delete;
-  SameThreadChecker& operator=(const SameThreadChecker& other) = delete;
-
-  /**
-   * Each invoke of check method should be in same thread, otherwise, it will
-   * failed and core dump.
-   */
-  void check() {
-    std::thread::id curThreadId = std::this_thread::get_id();
-    std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; });
-    CHECK_EQ(invokeThreadId_, curThreadId)
-        << "This method should invoke in "
-           "same thread, but first invoked in "
-        << invokeThreadId_ << " current invoked in " << curThreadId;
-  }
-
- private:
-  std::once_flag onceFlag_;
-  std::thread::id invokeThreadId_;
-};
-
-/**
- * Key-Value Cache Helper.
- *
- * It store a object instance global. User can invoke get method by key and a
- * object creator callback. If there is a instance stored in cache, then it will
- * return a shared_ptr of it, otherwise, it will invoke creator callback, create
- * a new instance store global, and return it.
- *
- * The cache instance will release when nobody hold a reference to it.
- *
- * The KType is the key type.
- * The VType is the value type.
- * The Hash is the key hasher object.
- */
-template <typename KType, typename VType, typename Hash>
-class WeakKVCache {
- public:
-  WeakKVCache() {}
-
-  std::shared_ptr<VType> get(const KType& key,
-                             const std::function<VType*()>& creator) {
-    std::lock_guard<std::mutex> guard(this->lock_);
-    auto it = this->storage_.find(key);
-    if (it != this->storage_.end()) {
-      auto& val = it->second;
-      auto retVal = val.lock();
-      if (retVal != nullptr) {
-        return retVal;
-      }  // else fall trough. Because it is WeakPtr Cache.
-    }
-    auto rawPtr = creator();
-    CHECK(rawPtr != nullptr);
-    std::shared_ptr<VType> retVal(rawPtr);
-    this->storage_[key] = retVal;
-    return retVal;
-  }
-
- private:
-  std::mutex lock_;
-  std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
-};
-
-/**
- * @brief The ScopedCallbacks class is a callback invoker when object is
- *        created and destroyed.
- */
-template <typename CallbackType, typename... Args>
-class ScopedCallbacks {
- public:
-  ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
-      : exit_(std::bind(exit, args...)) {
-    enter(args...);
-  }
-
-  ScopedCallbacks(const ScopedCallbacks& other) = delete;
-  ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete;
-
-  ~ScopedCallbacks() { exit_(); }
-
- private:
-  std::function<void()> exit_;
-};
-
-/**
- * std compatible allocator with memory alignment.
- * @tparam T type of allocator elements.
- * @tparam Alignment the alignment in bytes.
- */
-template <typename T, size_t Alignment>
-class AlignedAllocator {
- public:
-  /// std campatible typedefs.
-  typedef T* pointer;
-  typedef const T* const_pointer;
-  typedef T& reference;
-  typedef const T& const_reference;
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-
-  T* address(T& r) const { return &r; }
-
-  const T* address(const T& r) const { return &r; }
-
-  size_t max_size() const {
-    return std::numeric_limits<size_t>::max() / sizeof(T);
-  }
-
-  template <typename U>
-  struct rebind {
-    typedef AlignedAllocator<U, Alignment> other;
-  };
-
-  bool operator==(const AlignedAllocator& other) const { return true; }
-
-  bool operator!=(const AlignedAllocator& other) const {
-    return !(*this == &other);
-  }
-
-  void construct(const T* p, const T& t) const {
-    void* pv = const_cast<T*>(p);
-    new (pv) T(t);
-  }
-
-  void deallocate(const T* p, const size_type n) const {
-    (void)(n);  // UNUSED n
-    free(const_cast<T*>(p));
-  }
-
-  void destroy(const T* p) const { p->~T(); }
-
-  AlignedAllocator() {}
-  ~AlignedAllocator() {}
-
-  AlignedAllocator(const AlignedAllocator&) {}
-  template <typename U>
-  AlignedAllocator(const AlignedAllocator<U, Alignment>&) {}
-
-  /**
-   * @brief allocate n elements of type T, the first address is aligned by
-   *        Alignment bytes.
-   * @param n element count.
-   * @return begin address of allocated buffer
-   * @throw std::length_error for n * sizeof(T) is overflowed.
-   * @throw std::bad_alloc
-   */
-  T* allocate(const size_type n) const {
-    if (n == 0) {
-      return nullptr;
-    }
-    if (n > max_size()) {
-      throw std::length_error("AlignAllocator<T>::allocate() - Int Overflow.");
-    }
-    void* r = nullptr;
-    CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0);
-    if (r == nullptr) {
-      throw std::bad_alloc();
-    } else {
-      return static_cast<T*>(r);
-    }
-  }
-
-  template <typename U>
-  T* allocate(const std::size_t n, const U* /* const hint */) const {
-    return this->allocate(n);
-  }
-
- private:
-  AlignedAllocator& operator=(const AlignedAllocator&);  // disable
-};
-
-class Deprecated {
- public:
-  explicit Deprecated(const std::string& msg = "") {
-    if (msg.empty()) {
-      LOG(WARNING) << "This class is deprecated, please do not use this class.";
-    } else {
-      LOG(WARNING) << msg;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Version.cpp b/paddle/legacy/utils/Version.cpp
deleted file mode 100644
index 731c3084211..00000000000
--- a/paddle/legacy/utils/Version.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Version.h"
-
-#include <iomanip>
-#include <numeric>
-#include "Flags.h"
-#include "Util.h"
-
-DECLARE_bool(version);
-
-namespace paddle {
-namespace version {
-
-void printVersion(std::ostream& os) {
-#ifndef PADDLE_VERSION
-#define PADDLE_VERSION "unknown"
-#endif
-// converts macro to string
-// https://gcc.gnu.org/onlinedocs/cpp/Stringification.html
-#define xstr(s) str(s)
-#define str(s) #s
-
-  os << "paddle version: " << xstr(PADDLE_VERSION) << std::endl
-     << std::boolalpha << "\t"
-     << "withGpu: " << version::isWithGpu() << std::endl
-     << "\t"
-     << "withAvx: " << version::isWithAvx() << std::endl
-     << "\t"
-     << "withPyDataProvider: " << version::isWithPyDataProvider() << std::endl
-     << "\t"
-     << "withTimer: " << version::isWithTimer() << std::endl
-     << "\t"
-     << "withFpga: " << version::isWithFpga() << std::endl
-     << "\t"
-     << "real byte size: " << version::sizeofReal() << std::endl
-     << std::endl;
-}
-
-void printVersion() {
-  if (FLAGS_version) {
-    printVersion(std::cout);
-    exit(0);
-  }
-}
-
-}  //  namespace version
-}  //  namespace paddle
diff --git a/paddle/legacy/utils/Version.h b/paddle/legacy/utils/Version.h
deleted file mode 100644
index 004d62451cd..00000000000
--- a/paddle/legacy/utils/Version.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-#include <iostream>
-#include "Common.h"
-
-namespace paddle {
-
-/**
- * namespace paddle::version
- * Some constexpr to detect paddle version.
- *    use paddle_trainer --version to print version information.
- *
- * Possible output as follow:
- * paddle version:
- *    withGpu: false
- *    withAvx: false
- *    withPyDataProvider: true
- *    withTimer: false
- *    withFpga: false
- *    real byte size: 4
- */
-
-namespace version {
-
-/**
- * @brief print paddle version and exit when --version flag setted. Otherwise,
- * do nothing.
- */
-void printVersion();
-
-void printVersion(std::ostream& os);
-/**
- * @brief isWithGpu
- * @return return true if paddle compiled with GPU
- */
-constexpr bool isWithGpu() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithPyDataProvider
- * @return return true if paddle compiled with PyDataProvider
- *
- * @note: A complete python interpreter is embeded into paddle binary if paddle
- * is compiled with PyDataProvider. Then the config parser just invoke python
- * method. Otherwise, ConfigParser just serializes config into protobuf, and
- * pass to C++ by using stdio.
- */
-constexpr bool isWithPyDataProvider() {
-#ifdef PADDLE_NO_PYTHON
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithTimer
- * @return true if paddle compiled with timer.
- */
-constexpr bool isWithTimer() {
-#ifdef PADDLE_DISABLE_TIMER
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithAvx
- * @return true if paddle compiled with AVX instructs.
- */
-constexpr bool isWithAvx() {
-#ifdef __AVX__
-  return true;
-#else
-  return false;
-#endif
-}
-
-/**
- * @brief isWithFpga
- * @return true if paddle compiled with FPGA for prediction.
- */
-constexpr bool isWithFpga() {
-#ifdef PADDLE_USE_FPGA
-  return true;
-#else
-  return false;
-#endif
-}
-
-/**
- * @brief sizeofReal
- * @return return the byte size of real
- */
-constexpr size_t sizeofReal() { return sizeof(real); }
-
-/**
- * @brief isPaddleUseDouble
- * @return true if paddle compiled with double precision.
- */
-constexpr bool isPaddleUseDouble() { return sizeofReal() == sizeof(double); }
-
-/**
- * @brief isPaddleUseFloat
- * @return true if paddle compiled with float precision
- */
-constexpr bool isPaddleUseFloat() { return sizeofReal() == sizeof(float); }
-
-}  //  namespace version
-
-}  //  namespace paddle
diff --git a/paddle/legacy/utils/arch/linux/Locks.cpp b/paddle/legacy/utils/arch/linux/Locks.cpp
deleted file mode 100644
index 32d351e3328..00000000000
--- a/paddle/legacy/utils/arch/linux/Locks.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Locks.h"
-#include <semaphore.h>
-#include <unistd.h>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-class SemaphorePrivate {
- public:
-  sem_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  sem_init(&m->sem, 0, initValue);
-}
-
-Semaphore::~Semaphore() {
-  sem_destroy(&m->sem);
-  delete m;
-}
-
-bool Semaphore::timeWait(struct timespec* ts) {
-  return (0 == sem_timedwait(&m->sem, ts));
-}
-
-void Semaphore::wait() { sem_wait(&m->sem); }
-
-void Semaphore::post() { sem_post(&m->sem); }
-
-/// SpinLockPrivate
-
-#ifdef PADDLE_USE_PTHREAD_SPINLOCK
-
-class SpinLockPrivate {
- public:
-  inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
-  inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
-
-  inline void lock() { pthread_spin_lock(&lock_); }
-  inline void unlock() { pthread_spin_unlock(&lock_); }
-
-  pthread_spinlock_t lock_;
-  char padding_[64 - sizeof(pthread_spinlock_t)];
-};
-
-#else
-// clang-format off
-#include <cstddef>
-#include <atomic>
-// clang-format on
-
-class SpinLockPrivate {
- public:
-  inline void lock() {
-    while (lock_.test_and_set(std::memory_order_acquire)) {
-    }
-  }
-  inline void unlock() { lock_.clear(std::memory_order_release); }
-
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-#endif
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-void SpinLock::lock() { m->lock(); }
-void SpinLock::unlock() { m->unlock(); }
-
-/// ThreadBarrierPrivate
-
-#ifdef PADDLE_USE_PTHREAD_BARRIER
-
-class ThreadBarrierPrivate {
- public:
-  pthread_barrier_t barrier_;
-
-  inline explicit ThreadBarrierPrivate(int count) {
-    pthread_barrier_init(&barrier_, nullptr, count);
-  }
-
-  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
-
-  inline void wait() { pthread_barrier_wait(&barrier_); }
-};
-
-#else
-
-class ThreadBarrierPrivate {
- public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-#endif
-
-/// ThreadBarrier
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/arch/osx/Excepts.cpp b/paddle/legacy/utils/arch/osx/Excepts.cpp
deleted file mode 100644
index 2b7d6dca845..00000000000
--- a/paddle/legacy/utils/arch/osx/Excepts.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Excepts.h"
-
-#if defined(__APPLE__) || defined(__OSX__)
-#if defined(__arm__) || defined(__arm64__)
-// TODO(liuyiqun): implement the arm version
-int fegetexcept(void) { return -1; }
-int feenableexcept(unsigned int excepts) { return -1; }
-int fedisableexcept(unsigned int excepts) { return -1; }
-#else
-int fegetexcept(void) {
-  static fenv_t fenv;
-  return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
-}
-
-int feenableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // unmask
-  fenv.__control &= ~new_excepts;
-  fenv.__mxcsr &= ~(new_excepts << 7);
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-
-int fedisableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // mask
-  fenv.__control |= new_excepts;
-  fenv.__mxcsr |= new_excepts << 7;
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-#endif
-#endif
diff --git a/paddle/legacy/utils/arch/osx/Locks.cpp b/paddle/legacy/utils/arch/osx/Locks.cpp
deleted file mode 100644
index b68c48f0c31..00000000000
--- a/paddle/legacy/utils/arch/osx/Locks.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Locks.h"
-#include <dispatch/dispatch.h>
-#include <libkern/OSAtomic.h>
-#include <atomic>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-class SemaphorePrivate {
- public:
-  ~SemaphorePrivate() { dispatch_release(sem); }
-
-  dispatch_semaphore_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  m->sem = dispatch_semaphore_create(initValue);
-}
-
-Semaphore::~Semaphore() { delete m; }
-
-bool Semaphore::timeWait(timespec *ts) {
-  dispatch_time_t tm = dispatch_walltime(ts, 0);
-  return (0 == dispatch_semaphore_wait(m->sem, tm));
-}
-
-void Semaphore::wait() {
-  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
-}
-
-void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
-
-class SpinLockPrivate {
- public:
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-
-void SpinLock::lock() {
-  while (m->lock_.test_and_set(std::memory_order_acquire)) {
-  }
-}
-
-void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
-
-class ThreadBarrierPrivate {
- public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/enable_virtualenv.py b/paddle/legacy/utils/enable_virtualenv.py
deleted file mode 100644
index 4e998381e9e..00000000000
--- a/paddle/legacy/utils/enable_virtualenv.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-def __activate_virtual_env__():
-    __path__ = os.getenv('VIRTUAL_ENV')
-    if __path__ is None:
-        return
-    __script__ = os.path.join(__path__, 'bin', 'activate_this.py')
-    execfile(__script__, {'__file__': __script__})
-
-
-__activate_virtual_env__()
diff --git a/paddle/legacy/utils/tests/CMakeLists.txt b/paddle/legacy/utils/tests/CMakeLists.txt
deleted file mode 100644
index 4af01db5c84..00000000000
--- a/paddle/legacy/utils/tests/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-add_simple_unittest(test_Thread)
-add_simple_unittest(test_StringUtils)
-add_simple_unittest(test_CustomStackTrace)
-add_simple_unittest(test_ThreadBarrier)
-add_simple_unittest(test_SpinLock)
-add_simple_unittest(test_SIMDFlags)
-add_simple_unittest(test_Error)
-
-add_executable(
-    test_CustomStackTracePrint
-    test_CustomStackTracePrint.cpp
-)
-link_paddle_exe(test_CustomStackTracePrint)
-if(NOT APPLE)
-    add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
diff --git a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
deleted file mode 100644
index 2a418e3ae22..00000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>  // NOLINT
-#include <gtest/gtest.h>    // NOLINT
-
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 10, "testing thread number");
-
-void testNormalImpl(
-    const std::function<void(paddle::CustomStackTrace<std::string>&,
-                             size_t,
-                             size_t,
-                             paddle::ThreadBarrier&,
-                             paddle::ThreadBarrier&)>& callback) {
-  paddle::CustomStackTrace<std::string> tracer;
-  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
-  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
-  constexpr size_t countDown = 10;
-  constexpr size_t layerSize = 1000;
-  std::vector<std::unique_ptr<std::thread>> threads;
-  threads.reserve(FLAGS_test_thread_num);
-
-  for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) {
-    threads.emplace_back(
-        new std::thread([&tracer, &startBarrier, &doneBarrier, &callback] {
-          callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
-        }));
-  }
-  size_t cntDown = countDown;
-  while (cntDown-- > 0) {
-    startBarrier.wait();
-    sleep(1);
-    doneBarrier.wait();
-    ASSERT_TRUE(tracer.empty());
-  }
-
-  for (auto& thread : threads) {
-    thread->join();
-  }
-}
-
-TEST(CustomStackTrace, normalTrain) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
-      }
-      finish.wait();
-    }
-  });
-}
-
-TEST(CustomStackTrace, normalTest) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      tracer.clear();  // in forward test, tracer will clear after forward.
-      finish.wait();
-    }
-  });
-}
diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
deleted file mode 100644
index 78886a3ed9f..00000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-
-  for (size_t i = 0; i < 1000; ++i) {
-    paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
-    if (i == 998) {
-      throw "Unhandle exception";
-    }
-  }
-
-  return 0;
-}
diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh b/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
deleted file mode 100755
index b5543485f36..00000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-echo "Test Custom Stack Trace print correct result when fail"
-./test_CustomStackTracePrint >customStackTraceLog 2>&1
-if [ $? -eq 0 ]; then
-  exit 1
-else
-  set -e
-  TEXT=""
-  for ((i=0; i<=998; i++))
-  do
-    TEXT="layer_$i, "$TEXT
-  done
-  TEXT="Forwarding "$TEXT
-  grep -q "$TEXT" customStackTraceLog
-fi
diff --git a/paddle/legacy/utils/tests/test_Error.cpp b/paddle/legacy/utils/tests/test_Error.cpp
deleted file mode 100644
index 250c4d58a64..00000000000
--- a/paddle/legacy/utils/tests/test_Error.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Error.h"
-
-#include <gtest/gtest.h>
-
-TEST(Error, testAll) {
-  paddle::Error error;
-  ASSERT_TRUE(error.isOK());
-  error = paddle::Error("I'm the error");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("I'm the error", error.msg());
-
-  error = paddle::Error("error2");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("error2", error.msg());
-
-  int i = 3;
-  auto error3 = paddle::Error("error%d", i);
-  ASSERT_FALSE(error3.isOK());
-  ASSERT_STREQ("error3", error3.msg());
-}
diff --git a/paddle/legacy/utils/tests/test_SIMDFlags.cpp b/paddle/legacy/utils/tests/test_SIMDFlags.cpp
deleted file mode 100644
index 6362210acda..00000000000
--- a/paddle/legacy/utils/tests/test_SIMDFlags.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/CpuId.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(SIMDFlags, gccTest) {
-#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
-    !defined(__arm__) && !defined(__aarch64__)
-  // clang-format off
-  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
-  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
-  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
-  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
-  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
-  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
-  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
-  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
-// clang-format on
-#endif
-}
-
-TEST(SIMDFlags, normalPrint) {
-  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
-  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
-  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
-  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
-  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
-  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
-  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
-  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
-  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
-  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
-  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
-}
diff --git a/paddle/legacy/utils/tests/test_SpinLock.cpp b/paddle/legacy/utils/tests/test_SpinLock.cpp
deleted file mode 100644
index 4cd7836d6af..00000000000
--- a/paddle/legacy/utils/tests/test_SpinLock.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t, size_t&, paddle::SpinLock&)>& callback) {
-  paddle::SpinLock mutex;
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-
-  size_t count = 0;
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &count, &mutex, &callback] {
-      callback(thread_num, count, mutex);
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
-  }
-  // Check whether all threads reach this point or not
-  CHECK_EQ(count, thread_num);
-}
-
-TEST(ThreadSpinLock, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(
-        thread_num,
-        [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) {
-          std::lock_guard<paddle::SpinLock> lock(mutex);
-          ++count;
-        });
-  }
-}
diff --git a/paddle/legacy/utils/tests/test_StringUtils.cpp b/paddle/legacy/utils/tests/test_StringUtils.cpp
deleted file mode 100644
index 61d2815f097..00000000000
--- a/paddle/legacy/utils/tests/test_StringUtils.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/StringUtil.h"
-
-#include <gtest/gtest.h>
-
-TEST(StringUtil, to) {
-  ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
-}
diff --git a/paddle/legacy/utils/tests/test_Thread.cpp b/paddle/legacy/utils/tests/test_Thread.cpp
deleted file mode 100644
index 5e07da32368..00000000000
--- a/paddle/legacy/utils/tests/test_Thread.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Thread.h>
-#include <atomic>
-
-using paddle::AsyncThreadPool;  // NOLINT
-
-TEST(AsyncThreadPool, addJob) {
-  AsyncThreadPool pool(8);
-  auto a = pool.addJob([] { return 1; });
-  auto b = pool.addJob([] { return true; });
-  auto c = pool.addJob([] { return false; });
-
-  ASSERT_EQ(a.get(), 1);
-  ASSERT_TRUE(b.get());
-  ASSERT_FALSE(c.get());
-}
-
-TEST(AsyncThreadPool, addBatchJob) {
-  AsyncThreadPool pool(8);
-  std::atomic<int> counter{0};
-
-  std::vector<AsyncThreadPool::JobFunc> jobs;
-
-  for (int i = 0; i < 10000; i++) {
-    jobs.emplace_back([&] { counter++; });
-  }
-
-  pool.addBatchJobs(jobs);
-
-  ASSERT_EQ(counter, 10000);
-}
-
-TEST(AsyncThreadPool, multiThreadAddBatchJob) {
-  AsyncThreadPool levelOnePool(200);
-  AsyncThreadPool levelTwoPool(200);
-
-  std::shared_ptr<std::mutex> mut = std::make_shared<std::mutex>();
-  int counter = 0;
-  const int numMonitors = 300;
-  const int numSlaves = 300;
-  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
-    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
-      std::lock_guard<std::mutex> lk(*mut);
-      counter++;
-    });
-    levelTwoPool.addBatchJobs(slaveJobs);
-  });
-  levelOnePool.addBatchJobs(moniterJobs);
-  ASSERT_EQ(counter, numMonitors * numSlaves);
-}
-
-TEST(AsyncThreadPool, addBatchJobWithResults) {
-  AsyncThreadPool pool(100);
-
-  std::vector<std::function<int()>> jobs;
-  const int numJobs = 100;
-  for (int i = 0; i < numJobs; i++) {
-    jobs.emplace_back([i] { return i; });
-  }
-
-  std::vector<int> res;
-  pool.addBatchJobs(jobs, res);
-
-  for (int i = 0; i < numJobs; i++) {
-    ASSERT_EQ(res[i], i);
-  }
-}
diff --git a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
deleted file mode 100644
index 9c8851ae211..00000000000
--- a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t,
-                             std::mutex&,
-                             std::set<std::thread::id>&,
-                             paddle::ThreadBarrier&)>& callback) {
-  std::mutex mutex;
-  std::set<std::thread::id> tids;
-  paddle::ThreadBarrier barrier(thread_num);
-
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] {
-      callback(thread_num, mutex, tids, barrier);
-    });
-  }
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-}
-
-TEST(ThreadBarrier, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(thread_num,
-                   [](size_t thread_num,
-                      std::mutex& mutex,
-                      std::set<std::thread::id>& tids,
-                      paddle::ThreadBarrier& barrier) {
-                     {
-                       std::lock_guard<std::mutex> guard(mutex);
-                       tids.insert(std::this_thread::get_id());
-                     }
-                     barrier.wait();
-                     // Check whether all threads reach this point or not
-                     CHECK_EQ(tids.size(), thread_num);
-                   });
-  }
-}
-- 
GitLab


From ef038743f1c015b13287abcb87f7d63717f45b1b Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:34:39 +0800
Subject: [PATCH 15/73] remove legacy python code

---
 python/paddle/trainer/PyDataProvider2.py      |  541 --
 .../paddle/trainer/PyDataProviderWrapper.py   |  749 --
 python/paddle/trainer/__init__.py             |   13 -
 python/paddle/trainer/config_parser.py        | 4447 ----------
 .../paddle/trainer/config_parser_extension.py |   39 -
 python/paddle/trainer/recurrent_units.py      |  357 -
 .../paddle/trainer_config_helpers/__init__.py |   25 -
 .../trainer_config_helpers/activations.py     |  263 -
 python/paddle/trainer_config_helpers/attrs.py |  291 -
 .../config_parser_utils.py                    |   51 -
 .../trainer_config_helpers/data_sources.py    |  213 -
 .../default_decorators.py                     |  164 -
 .../trainer_config_helpers/evaluators.py      |  813 --
 .../trainer_config_helpers/layer_math.py      |  113 -
 .../paddle/trainer_config_helpers/layers.py   | 7610 -----------------
 .../paddle/trainer_config_helpers/networks.py | 1813 ----
 .../trainer_config_helpers/optimizers.py      |  447 -
 .../paddle/trainer_config_helpers/poolings.py |  148 -
 .../tests/CMakeLists.txt                      |   17 -
 .../tests/ProtobufEqualMain.cpp               |   59 -
 .../tests/configs/.gitignore                  |    1 -
 .../tests/configs/file_list.sh                |   17 -
 .../tests/configs/generate_protostr.sh        |   27 -
 .../tests/configs/img_layers.py               |   38 -
 .../tests/configs/img_trans_layers.py         |   38 -
 .../tests/configs/last_first_seq.py           |   35 -
 .../tests/configs/layer_activations.py        |   34 -
 .../tests/configs/math_ops.py                 |   42 -
 .../tests/configs/projections.py              |   80 -
 .../configs/protostr/img_layers.protostr      |  193 -
 .../protostr/img_trans_layers.protostr        |  193 -
 .../configs/protostr/last_first_seq.protostr  |  102 -
 .../protostr/layer_activations.protostr       |  423 -
 .../tests/configs/protostr/math_ops.protostr  |  413 -
 .../configs/protostr/projections.protostr     |  466 -
 .../tests/configs/protostr/shared_fc.protostr |  125 -
 .../configs/protostr/shared_gru.protostr      |  289 -
 .../configs/protostr/shared_lstm.protostr     |  385 -
 .../protostr/simple_rnn_layers.protostr       |  424 -
 .../protostr/test_BatchNorm3D.protostr        |   93 -
 .../protostr/test_bi_grumemory.protostr       |  155 -
 .../protostr/test_bilinear_interp.protostr    |  137 -
 .../configs/protostr/test_clip_layer.protostr |   31 -
 .../protostr/test_conv3d_layer.protostr       |  132 -
 .../protostr/test_cost_layers.protostr        |  375 -
 .../test_cost_layers_with_weight.protostr     |  162 -
 .../test_cross_entropy_over_beam.protostr     |  207 -
 .../protostr/test_deconv3d_layer.protostr     |  132 -
 .../test_detection_output_layer.protostr      |   66 -
 .../protostr/test_dot_prod_layer.protostr     |   38 -
 .../protostr/test_expand_layer.protostr       |   56 -
 .../test_factorization_machine.protostr       |   39 -
 .../tests/configs/protostr/test_fc.protostr   |   98 -
 .../protostr/test_gated_unit_layer.protostr   |  106 -
 .../protostr/test_grumemory_layer.protostr    |   51 -
 .../configs/protostr/test_hsigmoid.protostr   |   62 -
 .../test_kmax_seq_socre_layer.protostr        |   59 -
 .../protostr/test_l2_distance_layer.protostr  |   39 -
 .../protostr/test_lstmemory_layer.protostr    |   53 -
 .../configs/protostr/test_maxout.protostr     |  233 -
 .../test_multibox_loss_layer.protostr         |   79 -
 .../protostr/test_multiplex_layer.protostr    |   63 -
 .../configs/protostr/test_ntm_layers.protostr |  225 -
 .../tests/configs/protostr/test_pad.protostr  |  122 -
 .../protostr/test_pooling3D_layer.protostr    |  123 -
 .../protostr/test_prelu_layer.protostr        |  144 -
 .../protostr/test_print_layer.protostr        |   27 -
 .../protostr/test_recursive_topology.protostr |  593 --
 .../protostr/test_repeat_layer.protostr       |   42 -
 .../protostr/test_resize_layer.protostr       |   27 -
 .../configs/protostr/test_rnn_group.protostr  |  738 --
 .../protostr/test_roi_pool_layer.protostr     |  100 -
 .../configs/protostr/test_row_conv.protostr   |   41 -
 .../protostr/test_row_l2_norm_layer.protostr  |   27 -
 .../protostr/test_scale_shift_layer.protostr  |   72 -
 .../test_scale_sub_region_layer.protostr      |   51 -
 .../protostr/test_seq_concat_reshape.protostr |   51 -
 .../protostr/test_seq_slice_layer.protostr    |   79 -
 .../protostr/test_sequence_pooling.protostr   |  162 -
 .../configs/protostr/test_smooth_l1.protostr  |   40 -
 .../protostr/test_split_datasource.protostr   |   72 -
 .../configs/protostr/test_spp_layer.protostr  |   40 -
 .../test_sub_nested_seq_select_layer.protostr |   37 -
 .../configs/protostr/unused_layers.protostr   |   27 -
 .../configs/protostr/util_layers.protostr     |   87 -
 .../tests/configs/run_tests.sh                |   44 -
 .../tests/configs/shared_fc.py                |   43 -
 .../tests/configs/shared_gru.py               |   54 -
 .../tests/configs/shared_lstm.py              |   56 -
 .../tests/configs/simple_rnn_layers.py        |   51 -
 .../tests/configs/test_BatchNorm3D.py         |   25 -
 .../tests/configs/test_bi_grumemory.py        |   21 -
 .../tests/configs/test_bilinear_interp.py     |   41 -
 .../tests/configs/test_clip_layer.py          |   20 -
 .../test_config_parser_for_non_file_config.py |   51 -
 .../tests/configs/test_conv3d_layer.py        |   63 -
 .../tests/configs/test_cost_layers.py         |   61 -
 .../configs/test_cost_layers_with_weight.py   |   33 -
 .../tests/configs/test_crop.py                |   35 -
 .../configs/test_cross_entropy_over_beam.py   |   45 -
 .../tests/configs/test_deconv3d_layer.py      |   64 -
 .../configs/test_detection_output_layer.py    |   37 -
 .../tests/configs/test_dot_prod_layer.py      |   21 -
 .../tests/configs/test_expand_layer.py        |   28 -
 .../configs/test_factorization_machine.py     |   21 -
 .../tests/configs/test_fc.py                  |   30 -
 .../tests/configs/test_gated_unit_layer.py    |   30 -
 .../tests/configs/test_grumemory_layer.py     |   27 -
 .../tests/configs/test_hsigmoid.py            |   22 -
 .../configs/test_kmax_seq_socre_layer.py      |    9 -
 .../tests/configs/test_l2_distance_layer.py   |   21 -
 .../tests/configs/test_lstmemory_layer.py     |   27 -
 .../tests/configs/test_maxout.py              |   56 -
 .../tests/configs/test_multibox_loss_layer.py |   39 -
 .../tests/configs/test_multiplex_layer.py     |   26 -
 .../tests/configs/test_ntm_layers.py          |   44 -
 .../tests/configs/test_pad.py                 |   34 -
 .../tests/configs/test_pooling3D_layer.py     |   52 -
 .../tests/configs/test_prelu_layer.py         |   24 -
 .../tests/configs/test_print_layer.py         |   23 -
 .../tests/configs/test_recursive_topology.py  |   30 -
 .../tests/configs/test_repeat_layer.py        |   25 -
 .../tests/configs/test_resize_layer.py        |   20 -
 .../tests/configs/test_rnn_group.py           |   62 -
 .../tests/configs/test_roi_pool_layer.py      |   37 -
 .../tests/configs/test_row_conv.py            |   23 -
 .../tests/configs/test_row_l2_norm_layer.py   |   20 -
 .../tests/configs/test_scale_shift_layer.py   |   23 -
 .../configs/test_scale_sub_region_layer.py    |   25 -
 .../tests/configs/test_seq_concat_reshape.py  |   26 -
 .../tests/configs/test_seq_slice_layer.py     |   13 -
 .../tests/configs/test_sequence_pooling.py    |   43 -
 .../tests/configs/test_smooth_l1.py           |   21 -
 .../tests/configs/test_split_datasource.py    |   24 -
 .../tests/configs/test_spp_layer.py           |   24 -
 .../test_sub_nested_seq_select_layer.py       |   11 -
 .../tests/configs/unused_layers.py            |   25 -
 .../tests/configs/util_layers.py              |   27 -
 .../tests/layers_test.py                      |   20 -
 .../tests/layers_test_config.py               |   86 -
 .../tests/test_reset_hook.py                  |   29 -
 python/paddle/trainer_config_helpers/utils.py |   33 -
 python/paddle/v2/__init__.py                  |  156 -
 python/paddle/v2/activation.py                |   26 -
 python/paddle/v2/attr.py                      |   29 -
 python/paddle/v2/config_base.py               |   68 -
 python/paddle/v2/data_feeder.py               |  133 -
 python/paddle/v2/data_type.py                 |   27 -
 python/paddle/v2/dataset/__init__.py          |   46 -
 python/paddle/v2/dataset/cifar.py             |  148 -
 python/paddle/v2/dataset/common.py            |  236 -
 python/paddle/v2/dataset/conll05.py           |  257 -
 python/paddle/v2/dataset/flowers.py           |  218 -
 python/paddle/v2/dataset/imdb.py              |  148 -
 python/paddle/v2/dataset/imikolov.py          |  161 -
 python/paddle/v2/dataset/mnist.py             |  129 -
 python/paddle/v2/dataset/movielens.py         |  262 -
 python/paddle/v2/dataset/mq2007.py            |  333 -
 python/paddle/v2/dataset/sentiment.py         |  141 -
 python/paddle/v2/dataset/tests/cifar_test.py  |   56 -
 python/paddle/v2/dataset/tests/common_test.py |   94 -
 .../paddle/v2/dataset/tests/flowers_test.py   |   51 -
 python/paddle/v2/dataset/tests/imdb_test.py   |   57 -
 .../paddle/v2/dataset/tests/imikolov_test.py  |   67 -
 python/paddle/v2/dataset/tests/mnist_test.py  |   44 -
 python/paddle/v2/dataset/tests/mq2007_test.py |   33 -
 .../paddle/v2/dataset/tests/test_sentiment.py |   55 -
 .../paddle/v2/dataset/tests/voc2012_test.py   |   42 -
 python/paddle/v2/dataset/tests/wmt16_test.py  |   66 -
 python/paddle/v2/dataset/uci_housing.py       |  134 -
 python/paddle/v2/dataset/voc2012.py           |   85 -
 python/paddle/v2/dataset/wmt14.py             |  181 -
 python/paddle/v2/dataset/wmt16.py             |  352 -
 python/paddle/v2/evaluator.py                 |   36 -
 python/paddle/v2/event.py                     |  113 -
 python/paddle/v2/image.py                     |  380 -
 python/paddle/v2/inference.py                 |  172 -
 python/paddle/v2/layer.py                     |  326 -
 python/paddle/v2/master/.gitignore            |    3 -
 python/paddle/v2/master/__init__.py           |   17 -
 python/paddle/v2/master/client.py             |   95 -
 python/paddle/v2/minibatch.py                 |   43 -
 python/paddle/v2/networks.py                  |   33 -
 python/paddle/v2/op.py                        |  120 -
 python/paddle/v2/optimizer.py                 |  297 -
 python/paddle/v2/parameters.py                |  441 -
 python/paddle/v2/plot/__init__.py             |   17 -
 python/paddle/v2/plot/plot.py                 |   82 -
 python/paddle/v2/plot/tests/CMakeLists.txt    |    5 -
 python/paddle/v2/plot/tests/__init__.py       |   16 -
 python/paddle/v2/plot/tests/test_ploter.py    |   40 -
 python/paddle/v2/pooling.py                   |   26 -
 python/paddle/v2/reader/__init__.py           |   74 -
 python/paddle/v2/reader/creator.py            |  130 -
 python/paddle/v2/reader/decorator.py          |  405 -
 python/paddle/v2/reader/tests/CMakeLists.txt  |    2 -
 python/paddle/v2/reader/tests/__init__.py     |   13 -
 python/paddle/v2/reader/tests/creator_test.py |   74 -
 .../paddle/v2/reader/tests/decorator_test.py  |  178 -
 .../v2/reader/tests/test_data_creator.txt     |    3 -
 .../v2/reader/tests/test_reader_recordio.dat  |  Bin 76 -> 0 bytes
 .../v2/reader/tests/test_recordio_creator.dat |  Bin 88 -> 0 bytes
 python/paddle/v2/tests/CMakeLists.txt         |    8 -
 python/paddle/v2/tests/cat.jpg                |  Bin 57218 -> 0 bytes
 python/paddle/v2/tests/test_data_feeder.py    |  267 -
 python/paddle/v2/tests/test_image.py          |   43 -
 python/paddle/v2/tests/test_layer.py          |  290 -
 python/paddle/v2/tests/test_op.py             |   51 -
 .../paddle/v2/tests/test_paramconf_order.py   |   99 -
 python/paddle/v2/tests/test_parameters.py     |  143 -
 python/paddle/v2/tests/test_rnn_layer.py      |  166 -
 python/paddle/v2/tests/test_topology.py       |   85 -
 python/paddle/v2/topology.py                  |  145 -
 python/paddle/v2/trainer.py                   |  258 -
 214 files changed, 37347 deletions(-)
 delete mode 100644 python/paddle/trainer/PyDataProvider2.py
 delete mode 100644 python/paddle/trainer/PyDataProviderWrapper.py
 delete mode 100644 python/paddle/trainer/__init__.py
 delete mode 100644 python/paddle/trainer/config_parser.py
 delete mode 100644 python/paddle/trainer/config_parser_extension.py
 delete mode 100644 python/paddle/trainer/recurrent_units.py
 delete mode 100644 python/paddle/trainer_config_helpers/__init__.py
 delete mode 100644 python/paddle/trainer_config_helpers/activations.py
 delete mode 100644 python/paddle/trainer_config_helpers/attrs.py
 delete mode 100644 python/paddle/trainer_config_helpers/config_parser_utils.py
 delete mode 100644 python/paddle/trainer_config_helpers/data_sources.py
 delete mode 100644 python/paddle/trainer_config_helpers/default_decorators.py
 delete mode 100644 python/paddle/trainer_config_helpers/evaluators.py
 delete mode 100644 python/paddle/trainer_config_helpers/layer_math.py
 delete mode 100644 python/paddle/trainer_config_helpers/layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/networks.py
 delete mode 100644 python/paddle/trainer_config_helpers/optimizers.py
 delete mode 100644 python/paddle/trainer_config_helpers/poolings.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/CMakeLists.txt
 delete mode 100644 python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/.gitignore
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/file_list.sh
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/img_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/math_ops.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/projections.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_crop.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_fc.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pad.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/util_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/layers_test.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/layers_test_config.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/test_reset_hook.py
 delete mode 100644 python/paddle/trainer_config_helpers/utils.py
 delete mode 100644 python/paddle/v2/__init__.py
 delete mode 100644 python/paddle/v2/activation.py
 delete mode 100644 python/paddle/v2/attr.py
 delete mode 100644 python/paddle/v2/config_base.py
 delete mode 100644 python/paddle/v2/data_feeder.py
 delete mode 100644 python/paddle/v2/data_type.py
 delete mode 100644 python/paddle/v2/dataset/__init__.py
 delete mode 100644 python/paddle/v2/dataset/cifar.py
 delete mode 100644 python/paddle/v2/dataset/common.py
 delete mode 100644 python/paddle/v2/dataset/conll05.py
 delete mode 100644 python/paddle/v2/dataset/flowers.py
 delete mode 100644 python/paddle/v2/dataset/imdb.py
 delete mode 100644 python/paddle/v2/dataset/imikolov.py
 delete mode 100644 python/paddle/v2/dataset/mnist.py
 delete mode 100644 python/paddle/v2/dataset/movielens.py
 delete mode 100644 python/paddle/v2/dataset/mq2007.py
 delete mode 100644 python/paddle/v2/dataset/sentiment.py
 delete mode 100644 python/paddle/v2/dataset/tests/cifar_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/common_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/flowers_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/imdb_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/imikolov_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/mnist_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/mq2007_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/test_sentiment.py
 delete mode 100644 python/paddle/v2/dataset/tests/voc2012_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/wmt16_test.py
 delete mode 100644 python/paddle/v2/dataset/uci_housing.py
 delete mode 100644 python/paddle/v2/dataset/voc2012.py
 delete mode 100644 python/paddle/v2/dataset/wmt14.py
 delete mode 100644 python/paddle/v2/dataset/wmt16.py
 delete mode 100644 python/paddle/v2/evaluator.py
 delete mode 100644 python/paddle/v2/event.py
 delete mode 100644 python/paddle/v2/image.py
 delete mode 100644 python/paddle/v2/inference.py
 delete mode 100644 python/paddle/v2/layer.py
 delete mode 100644 python/paddle/v2/master/.gitignore
 delete mode 100644 python/paddle/v2/master/__init__.py
 delete mode 100644 python/paddle/v2/master/client.py
 delete mode 100644 python/paddle/v2/minibatch.py
 delete mode 100644 python/paddle/v2/networks.py
 delete mode 100644 python/paddle/v2/op.py
 delete mode 100644 python/paddle/v2/optimizer.py
 delete mode 100644 python/paddle/v2/parameters.py
 delete mode 100644 python/paddle/v2/plot/__init__.py
 delete mode 100644 python/paddle/v2/plot/plot.py
 delete mode 100644 python/paddle/v2/plot/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/plot/tests/__init__.py
 delete mode 100644 python/paddle/v2/plot/tests/test_ploter.py
 delete mode 100644 python/paddle/v2/pooling.py
 delete mode 100644 python/paddle/v2/reader/__init__.py
 delete mode 100644 python/paddle/v2/reader/creator.py
 delete mode 100644 python/paddle/v2/reader/decorator.py
 delete mode 100644 python/paddle/v2/reader/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/reader/tests/__init__.py
 delete mode 100644 python/paddle/v2/reader/tests/creator_test.py
 delete mode 100644 python/paddle/v2/reader/tests/decorator_test.py
 delete mode 100644 python/paddle/v2/reader/tests/test_data_creator.txt
 delete mode 100644 python/paddle/v2/reader/tests/test_reader_recordio.dat
 delete mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat
 delete mode 100644 python/paddle/v2/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/tests/cat.jpg
 delete mode 100644 python/paddle/v2/tests/test_data_feeder.py
 delete mode 100644 python/paddle/v2/tests/test_image.py
 delete mode 100644 python/paddle/v2/tests/test_layer.py
 delete mode 100644 python/paddle/v2/tests/test_op.py
 delete mode 100644 python/paddle/v2/tests/test_paramconf_order.py
 delete mode 100644 python/paddle/v2/tests/test_parameters.py
 delete mode 100644 python/paddle/v2/tests/test_rnn_layer.py
 delete mode 100644 python/paddle/v2/tests/test_topology.py
 delete mode 100644 python/paddle/v2/topology.py
 delete mode 100644 python/paddle/v2/trainer.py

diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
deleted file mode 100644
index 05635833bf1..00000000000
--- a/python/paddle/trainer/PyDataProvider2.py
+++ /dev/null
@@ -1,541 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cPickle
-import logging
-import collections
-import functools
-import itertools
-
-logging.basicConfig(format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]"
-                    " %(message)s")
-
-
-class SequenceType(object):
-    NO_SEQUENCE = 0
-    SEQUENCE = 1
-    SUB_SEQUENCE = 2
-
-    @classmethod
-    def tostring(cls, value):
-        for k in cls.__dict__:
-            if not k.startswith('__'):
-                if getattr(cls, k) == value:
-                    return cls.__name__ + '.' + k
-        return 'INVALID(' + str(value) + ')'
-
-
-# TODO(yuyang18): Add string data type here.
-class DataType(object):
-    Dense = 0
-    SparseNonValue = 1
-    SparseValue = 2
-    Index = 3
-
-    @classmethod
-    def tostring(cls, value):
-        for k in cls.__dict__:
-            if not k.startswith('__'):
-                if getattr(cls, k) == value:
-                    return cls.__name__ + '.' + k
-        return 'INVALID(' + str(value) + ')'
-
-
-class CacheType(object):
-    NO_CACHE = 0  # No cache at all
-
-    # First pass, read data from python.  And store them in memory. Read from
-    # memory during rest passes.
-    CACHE_PASS_IN_MEM = 1
-
-
-class InputType(object):
-    """
-    InputType is the base class for paddle input types.
-
-    ..  note::
-
-        this is a base class, and should never be used by user.
-
-    :param dim: dimension of input. If the input is an integer, it means the
-                value range. Otherwise, it means the size of layer.
-    :type dim: int
-    :param seq_type: sequence type of input. 0 means it is not a sequence. 1
-                     means it is a variable length sequence. 2 means it is a
-                     nested sequence.
-    :type seq_type: int
-    :param type: data type of input.
-    :type type: int
-    """
-    __slots__ = ['dim', 'seq_type', 'type']
-
-    def __init__(self, dim, seq_type, tp):
-        self.dim = dim
-        self.seq_type = seq_type
-        self.type = tp
-
-    def __repr__(self):
-        """
-        Return a human readable representation like 'InputType(dim=25921, 
-            seq_type=SequenceType.NO_SEQUENCE, type=DataType.Dense)'
-        """
-        repr_str = type(self).__name__
-        repr_str += '('
-        serialize_func_map = {
-            'dim': repr,
-            'seq_type': SequenceType.tostring,
-            'type': DataType.tostring
-        }
-        for idx, k in enumerate(self.__slots__):
-            if idx != 0:
-                repr_str += ', '
-            repr_str += (
-                k + '=' + serialize_func_map.get(k, repr)(getattr(self, k)))
-        repr_str += ')'
-        return repr_str
-
-
-def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Dense Array. It means the input feature is dense array with float type.
-    For example, if the input is an image with 28*28 pixels, the input of
-    Paddle neural network could be a dense vector with dimension 784 or a
-    numpy array with shape (28, 28).
-
-    For the 2-D convolution operation, each sample in one mini-batch must have
-    the similarly size in PaddlePaddle now. But, it supports variable-dimension
-    feature across mini-batch. For the variable-dimension, the param dim is not
-    used. While the data reader must yield numpy array and the data feeder will
-    set the data shape correctly.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.Dense)
-
-
-def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Sparse binary vector. It means the input feature is a sparse vector and the
-    every element in this vector is either zero or one.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.SparseNonValue)
-
-
-def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Sparse vector. It means the input feature is a sparse vector. Most of the
-    elements in this vector are zero, others could be any float value.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.SparseValue)
-
-
-def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Data type of integer.
-
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :param value_range: range of this integer.
-    :type value_range: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return InputType(value_range, seq_type, DataType.Index)
-
-
-dense_vector = dense_slot
-sparse_binary_vector = sparse_non_value_slot
-sparse_float_vector = sparse_value_slot
-integer_value = index_slot
-
-# dense_array can be used for variable-length input feature.
-# Each feature is not a vector, but a multi-dimensional array.
-dense_array = dense_slot
-
-
-def dense_vector_sequence(dim):
-    """
-    Data type of a sequence of dense vector.
-
-    :param dim: dimension of dense vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return dense_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def dense_vector_sub_sequence(dim):
-    return dense_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def sparse_binary_vector_sequence(dim):
-    """
-    Data type of a sequence of sparse vector, which every element is either zero
-     or one.
-
-    :param dim: dimension of sparse vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def sparse_binary_vector_sub_sequence(dim):
-    return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def sparse_float_vector_sequence(dim):
-    """
-    Data type of a sequence of sparse vector, which most elements are zero,
-    others could be any float value.
-
-    :param dim: dimension of sparse vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def sparse_float_vector_sub_sequence(dim):
-    return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def integer_value_sequence(value_range):
-    """
-    Data type of a sequence of integer.
-
-    :param value_range: range of each element.
-    :type value_range: int
-    """
-    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)
-
-
-def integer_value_sub_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-integer_sequence = integer_value_sequence
-
-
-class SingleSlotWrapper(object):
-    def __init__(self, generator):
-        self.generator = generator
-
-    def __call__(self, obj, filename):
-        for item in self.generator(obj, filename):
-            if isinstance(item, dict):
-                yield item
-            else:
-                yield [item]
-
-
-class InputOrderWrapper(object):
-    def __init__(self, generator, input_order):
-        self.generator = generator
-        self.input_order = input_order
-
-    def __call__(self, obj, filename):
-        for item in self.generator(obj, filename):
-            if isinstance(item, dict):
-                yield [
-                    item.get(input_name, None)
-                    for input_name in self.input_order
-                ]
-            else:
-                yield item
-
-
-class CheckWrapper(object):
-    def __init__(self, generator, input_types, check_fail_continue, logger):
-        self.generator = generator
-        self.input_types = input_types
-        self.check_fail_continue = check_fail_continue
-        self.logger = logger
-
-    def __call__(self, obj, filename):
-        for items in self.generator(obj, filename):
-            try:
-                assert len(items) == len(self.input_types)
-                assert len(filter(lambda x: x is None, items)) == 0
-                for item, input_type in itertools.izip(items, self.input_types):
-                    callback = functools.partial(CheckWrapper.loop_callback,
-                                                 input_type)
-
-                    for _ in xrange(input_type.seq_type):
-                        callback = functools.partial(CheckWrapper.loop_check,
-                                                     callback)
-                    callback(item)
-
-                yield items
-            except AssertionError as e:
-                self.logger.warning(
-                    "Item (%s) is not fit the input type with error %s" %
-                    (repr(item), repr(e)))
-
-                if self.check_fail_continue:
-                    continue
-                else:
-                    raise
-
-    @staticmethod
-    def loop_callback(input_type, each):
-        assert isinstance(input_type, InputType)
-        if input_type.type == DataType.Dense:
-            assert isinstance(each, collections.Sequence)
-            for d in each:
-                assert isinstance(d, float)
-            assert len(each) == input_type.dim
-        elif input_type.type == DataType.Index:
-            assert isinstance(each, int)
-            assert each < input_type.dim
-        elif input_type.type == DataType.SparseNonValue \
-                or input_type.type == DataType.SparseValue:
-            assert isinstance(each, collections.Sequence)
-            sparse_id = set()
-            for k in each:
-                if input_type.type == DataType.SparseValue:
-                    k, v = k
-                    assert isinstance(v, float)
-                assert isinstance(k, int)
-                assert k < input_type.dim
-                sparse_id.add(k)
-            assert len(sparse_id) == len(each)
-        else:
-            raise RuntimeError("Not support input type")
-
-    @staticmethod
-    def loop_check(callback, item):
-        for each in item:
-            callback(each)
-
-
-class CheckInputTypeWrapper(object):
-    def __init__(self, generator, input_types, logger):
-        self.generator = generator
-        self.input_types = input_types
-        self.logger = logger
-
-    def __call__(self, obj, filename):
-        for items in self.generator(obj, filename):
-            try:
-                # dict type is required for input_types when item is dict type
-                assert (isinstance(items, dict) and \
-                        not isinstance(self.input_types, dict))==False
-                yield items
-            except AssertionError as e:
-                self.logger.error(
-                    "%s type is required for input type but got %s" %
-                    (repr(type(items)), repr(type(self.input_types))))
-                raise
-
-
-def provider(input_types=None,
-             should_shuffle=None,
-             pool_size=-1,
-             min_pool_size=-1,
-             can_over_batch_size=True,
-             calc_batch_size=None,
-             cache=CacheType.NO_CACHE,
-             check=False,
-             check_fail_continue=False,
-             init_hook=None,
-             **outter_kwargs):
-    """
-    Provider decorator. Use it to make a function into PyDataProvider2 object.
-    In this function, user only need to get each sample for some train/test
-    file.
-
-    The basic usage is:
-
-    ..  code-block:: python
-
-        @provider(some data provider config here...)
-        def process(settings, file_name):
-            while not at end of file_name:
-                sample = readOneSampleFromFile(file_name)
-                yield sample.
-
-    The configuration of data provider should be setup by\:
-
-    :param input_types: Specify the input types, can also be set in init_hook.
-                        It could be a list of InputType object. For example,
-                        input_types=[dense_vector(9), integer_value(2)]. Or user
-                        can set a dict of InputType object, which key is
-                        data_layer's name. For example, input_types=\
-                        {'img': img_features, 'label': label}. when using dict of
-                        InputType, user could yield a dict of feature values, which
-                        key is also data_layer's name.
-
-    :type input_types: list|tuple|dict
-
-    :param should_shuffle: True if data should shuffle. Pass None means shuffle
-                           when is training and not to shuffle when is testing.
-    :type should_shuffle: bool
-
-    :param pool_size: Max number of sample in data pool.
-    :type pool_size: int
-
-    :param min_pool_size: Set minimal sample in data pool. The PaddlePaddle will
-                          random pick sample in pool. So the min_pool_size
-                          effect the randomize of data.
-    :type min_pool_size: int
-
-    :param can_over_batch_size: True if paddle can return a mini-batch larger
-                                than batch size in settings. It is useful when
-                                custom calculate one sample's batch_size.
-
-                                It is very danger to set it to false and use
-                                calc_batch_size together. Default is true.
-    :type can_over_batch_size: bool
-
-    :param calc_batch_size: a method to calculate each sample's batch size.
-                            Default each sample's batch size is 1. But to you
-                            can customize each sample's batch size.
-    :type calc_batch_size: callable
-
-    :param cache: Cache strategy of Data Provider. Default is CacheType.NO_CACHE
-    :type cache: int
-
-    :param init_hook: Initialize hook. Useful when data provider need load some
-                      external data like dictionary. The parameter is
-                      (settings, file_list, \*\*kwargs).
-
-                      - settings. It is the global settings object. User can set
-                        settings.input_types here.
-                      - file_list. All file names for passed to data provider.
-                      - is_train. Is this data provider used for training or not.
-                      - kwargs. Other keyword arguments passed from
-                        trainer_config's args parameter.
-    :type init_hook: callable
-
-    :param check: Check the yield data format is as same as input_types. Enable
-                  this will make data provide process slow but it is very useful
-                  for debug. Default is disabled.
-    :type check: bool
-
-    :param check_fail_continue: Continue train or not when check failed. Just
-                                drop the wrong format data when it is True. Has
-                                no effect when check set to False.
-    :type check_fail_continue: bool
-    """
-
-    def __wrapper__(generator):
-        class DataProvider(object):
-            def __init__(self, file_list, **kwargs):
-                self.logger = logging.getLogger("")
-                self.logger.setLevel(logging.INFO)
-                self.input_types = None
-                self.should_shuffle = should_shuffle
-
-                true_table = [1, 't', 'true', 'on']
-                false_table = [0, 'f', 'false', 'off']
-                if not isinstance(self.should_shuffle, bool) and \
-                                self.should_shuffle is not None:
-
-                    if isinstance(self.should_shuffle, basestring):
-                        self.should_shuffle = self.should_shuffle.lower()
-
-                    if self.should_shuffle in true_table:
-                        self.should_shuffle = True
-                    elif self.should_shuffle in false_table:
-                        self.should_shuffle = False
-                    else:
-                        self.logger.warning(
-                            "Could not recognize should_shuffle (%s), "
-                            "just use default value of should_shuffle."
-                            " Please set should_shuffle to bool value or "
-                            "something in %s" %
-                            (repr(self.should_shuffle),
-                             repr(true_table + false_table)))
-                        self.should_shuffle = None
-
-                self.pool_size = pool_size
-                self.can_over_batch_size = can_over_batch_size
-                self.calc_batch_size = calc_batch_size
-                self.file_list = file_list
-                self.generator = generator
-                self.cache = cache
-                self.min_pool_size = min_pool_size
-                self.input_order = kwargs['input_order']
-                self.check = check
-                if init_hook is not None:
-                    init_hook(self, file_list=file_list, **kwargs)
-
-                if 'slots' in outter_kwargs:
-                    self.logger.warning('setting slots value is deprecated, '
-                                        'please use input_types instead.')
-                    self.slots = outter_kwargs['slots']
-                if input_types is not None:
-                    self.slots = input_types
-
-                if self.input_types is not None:
-                    self.slots = self.input_types
-
-                assert self.slots is not None, \
-                    "Data Provider's input_types must be set"
-                assert self.generator is not None
-
-                use_dynamic_order = False
-                if isinstance(self.slots, dict):  # reorder input_types
-                    self.slots = [self.slots[ipt] for ipt in self.input_order]
-                    use_dynamic_order = True
-
-                if len(self.slots) == 1:
-                    self.generator = SingleSlotWrapper(self.generator)
-
-                if use_dynamic_order:
-                    self.generator = InputOrderWrapper(self.generator,
-                                                       self.input_order)
-                else:
-                    self.generator = CheckInputTypeWrapper(
-                        self.generator, self.slots, self.logger)
-                if self.check:
-                    self.generator = CheckWrapper(self.generator, self.slots,
-                                                  check_fail_continue,
-                                                  self.logger)
-
-        return DataProvider
-
-    return __wrapper__
-
-
-def deserialize_args(args):
-    """
-    Internal use only.
-    :param args:
-    :return:
-    """
-    return cPickle.loads(args)
diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py
deleted file mode 100644
index 374976db9f1..00000000000
--- a/python/paddle/trainer/PyDataProviderWrapper.py
+++ /dev/null
@@ -1,749 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module provide a wrapper(decorator) to wrap a data process method into a
-PyDataProvider. Some examples are shown `here <data_provider/python_case.html>`_.
-"""
-
-import struct
-import array
-import random
-import gc
-import logging
-import pstats
-import sys
-import numpy
-import functools
-
-__all__ = [
-    'DenseSlot', 'SlotType', 'SparseNonValueSlot', 'StringSlot',
-    'SparseValueSlot', 'IndexSlot', 'PoolSize', 'GeneralPyDataProvider',
-    'provider', 'init_hook_wrapper'
-]
-
-try:  # Just for profile mode, will try to import cProfile first.
-    # Most python will contains cProfile, cProfile/profile are basically same.
-    # ref: https://docs.python.org/2/library/profile.html#introduction-to-the-profilers
-    import cProfile as profile
-except ImportError:
-    import profile
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-
-import io
-
-
-class SlotType(object):  # Just a hint for user.
-    pass
-
-
-class DenseSlot(SlotType):
-    """
-    Dense Slot Type: Each item is the value of a Dense Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [float, float, ... ]
-    - **Seq**: [[float, float, ...], [float, float ....], ... ]
-    - **SubSeq**: [[[float, float, ...], [float ....], ...] ,  \
-                   [[float, float, ...], [float ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 0
-
-
-class SparseNonValueSlot(SlotType):
-    """
-    Sparse NonValue Slot Type: Each item is the id of a Sparse Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [int, int, ...]
-    - **Seq**: [[int, int, ...], [int, int, ...], ... ]
-    - **SubSeq**: [[[int, int, ...], [int, ....], ...] ,  \
-                   [[int, int, ...], [int, ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 1
-
-
-class SparseValueSlot(SlotType):
-    """
-    Sparse Value Slot Type: Each item is the id and value of a Sparse Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [(int, float), (int, float), ... ]
-    - **Seq**: [[(int,float), (int, float), ... ], \
-                [(int, float), (int, float), ...], ... ]
-    - **SubSeq**: [[[(int,float), ...], [(int, float), ....], ...] ,  \
-                   [[(int,float), ...], [(int, float), ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension.
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 2
-
-
-class IndexSlot(SlotType):
-    """
-    Index Value Slot Type: Each item is the id of Label.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: int
-    - **Seq**:  [int, int, ....]
-    - **SubSeq**: [[int, int, ...], [int, int, ...], ... ]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 3
-
-
-class StringSlot(SlotType):
-    """
-    String Value Slot Type: Each item is a string for printout, \
-                            can be used in DataLayer too.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: string
-    - **Seq**: [string, string, ....]
-    - **SubSeq**:  [[string, string, ...], [string, string, ...], ... ]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: string
-        """
-        self.dim = dim
-        self.type = 6
-
-
-class SparseNonValueHandler(object):
-    """
-    Private Class, Use for converting python object to paddle string.
-    """
-
-    def __init__(self):
-        self.offsets = []
-        self.value = []
-        self.offset_count = 0
-
-    def __call__(self, ele):
-        """
-        It will be invoked when scan each sparse data.
-
-        :param ele: list of sparse data, maybe non-value [ idx, ... ] or value.
-                    [ (idx, val), ... ]
-        :type ele: list
-        """
-        self.offsets.append(self.offset_count)
-        self.offset_count += len(ele)
-        self.processElement(ele)
-
-    def processElement(self, ele):
-        """
-        Process for element list. See __call__ for more document.
-        """
-        self.value += ele
-
-    def done(self, data_stream, int_packer):
-        """
-        Dump data to stream.
-        :param data_stream: Output Stream.
-        :param int_packer:  A struct.Struct("i") object
-        """
-        data_stream.write(array.array("i", self.offsets).tostring())
-        data_stream.write(int_packer.pack(self.offset_count))
-        data_stream.write(array.array("i", self.value).tostring())
-
-
-class SparseValueHandler(SparseNonValueHandler):
-    """
-    Private class, use for converting python obj to paddle string.
-    """
-
-    def __init__(self):
-        SparseNonValueHandler.__init__(self)
-        self.weight = []
-
-    def processElement(self, ele):
-        for idx, w in ele:
-            self.value.append(idx)
-            self.weight.append(w)
-
-    def done(self, data_stream, int_packer):
-        SparseNonValueHandler.done(self, data_stream, int_packer)
-        data_stream.write(int_packer.pack(self.offset_count))
-        data_stream.write(array.array("f", self.weight).tostring())
-
-
-class StringHandler(object):
-    """
-    Private Class, Use for converting python object to paddle string.
-    """
-
-    def __init__(self, data_stream, int_packer):
-        self.data_stream = data_stream
-        self.int_packer = int_packer
-
-    def __call__(self, ele):
-        """
-        It will be invoked when scan each string data.
-        :param ele: string data
-        :type ele: str
-        """
-        self.data_stream.write(self.int_packer.pack(len(ele)))
-        self.data_stream.write(array.array("c", ele).tostring())
-
-
-class GeneralPyDataProvider:
-    def __init__(self, *file_list, **kwargs):
-        """
-        :param file_list: input file_list
-        """
-        del kwargs  # unused
-        gc.disable()
-        assert isinstance(self.logger, logging.Logger)
-        self.use_seq_flag = hasattr(self, "use_seq_flag") and self.use_seq_flag
-        self.slots_num = len(self.getSlots())
-        self.file_list = list(file_list)
-        self.generators = map(self.generateData, self.file_list)
-        self.int_packer = struct.Struct("i")
-        self.head_packer = struct.Struct("ii")
-        self.float_packer = struct.Struct("f")
-        self.shuffler = lambda *args, **kwargs: None
-        self.data_pool = []
-        self.has_subseq = []
-        self.has_checked = False
-
-        self.debug = hasattr(self, "debug") and self.debug
-
-        if hasattr(self, "profile_filename") and isinstance(
-                self.profile_filename, str):
-            self.profile_count = 0
-            self.is_profile = True
-        else:
-            self.is_profile = False
-
-        if not hasattr(self, "file_count") or not isinstance(self.file_count,
-                                                             int):
-            self.file_count = sys.maxint
-
-        if not hasattr(self, "can_over_batch_size"):
-            self.can_over_batch_size = True
-        elif not self.can_over_batch_size:
-            self.logger.warn(
-                "User should ensure every data size is not larger than batch"
-                " size when can_over_batch_size = False")
-
-        self.data_pool_idx = 0
-
-    def reset(self):
-        """Reset all data in provider."""
-
-        self.logger.debug("reset dataprovider.")
-        self.generators = map(self.generateData, self.file_list)
-        self.shuffler = lambda *args, **kwargs: None
-        self.data_pool = []
-        self.data_pool_idx = 0
-        if self.file_count != 0:
-            self.max_pool_size = 0
-
-        # When use Profile, each pass will print a profile result.
-        if self.is_profile:
-            if hasattr(self, "profiler") and isinstance(self.profiler,
-                                                        profile.Profile):
-                self.profiler.disable()
-                fn = "%s_%d" % (self.profile_filename, self.profile_count)
-                sortby = "cumulative"
-                with open(fn, "w") as f:
-                    pstats.Stats(
-                        self.profiler,
-                        stream=f).sort_stats(sortby).print_stats()
-                self.logger.info("saving profile to file %s" % fn)
-                self.profile_count += 1
-            self.logger.info("resetting profile")
-            self.profiler = profile.Profile()
-            self.profiler.enable()
-
-    def shuffle(self):
-        """ shuffle data"""
-        if not self.should_shuffle:
-            return
-        else:
-            self.logger.debug("shuffling data.")
-            random.shuffle(self.generators)
-            self.shuffler = random.shuffle
-
-    def getSlots(self):
-        """
-        :return : return a list of SlotType
-        :rtype: list
-        """
-        return []
-
-    def generateData(self, fn):
-        """
-        :param fn: file name
-        :return: a generator to yield data one by one.
-        """
-        raise NotImplementedError
-
-    def calculateDataBatchSize(self, data):
-        """
-        :param data: One sample which yield by generateData
-        :type data: list
-        :return: The batch size that the data contribute.
-        :rtype: int
-        """
-        return 1
-
-    def getHeader(self):
-        """return paddle header format"""
-        ret = self.head_packer.pack(self.slots_num, self.use_seq_flag)
-        for obj in self.getSlots():
-            ret += self.head_packer.pack(obj.type, obj.dim)
-        return ret
-
-    def getHeaderNative(self):
-        return self.use_seq_flag, self.getSlots()
-
-    def getNextBatchNative(self, batch_size):
-        ret_list = []
-        self.__prepareData(batch_size, ret_list)
-        return ret_list
-
-    def getNextBatch(self, batch_size):
-        """
-        :param batch_size: the batch_size approximately return.
-        :return: return paddle pyDataProvider format, just see documents.
-        :rtype: str
-
-        NOTE: If can_over_batch_size is True, the return batch_size >= input batch_size.
-              Otherwise, the return batch_size < input batch_size, BUT USER MUST ENSURE THAT each data's batch size
-              is less than input batch_size.
-        """
-        ret_list = []
-        current_batch_size = self.__prepareData(batch_size, ret_list)
-        # create unified format for ret_list with differnt slots_num
-        if self.slots_num == 1:
-            ret_list = [ret_list]
-
-        if current_batch_size == 0:
-            return self.int_packer.pack(current_batch_size)
-        data_bytes = io.BytesIO()
-        seq_bytes = io.BytesIO()
-        subseq_bytes = io.BytesIO()
-        data_stream = io.BufferedWriter(data_bytes)
-        seq_stream = io.BufferedWriter(seq_bytes)
-        subseq_stream = io.BufferedWriter(subseq_bytes)
-
-        def convertDataImpl(idx, data_callback):
-            """
-            This method will handle sequence in return data. invoke data_callback one by one.
-            :param idx: the slot index.
-            :param data_callback: a callback, which type is (each sample) => None.
-            """
-            indices = 0
-            slot_sample_num = len(ret_list)
-            if self.use_seq_flag:
-                slot_sample_num = 0
-                if self.has_subseq[idx]:  # has sub-sequence
-                    slot_subseq_num = 0
-                    for dat in ret_list:
-                        dat = dat[idx]
-                        slot_subseq_num += len(dat)
-                        for sub_dat in dat:
-                            slot_sample_num += len(sub_dat)
-                    subseq_stream.write(self.int_packer.pack(slot_subseq_num))
-                else:
-                    for dat in ret_list:
-                        dat = dat[idx]
-                        slot_sample_num += len(dat)
-                seq_stream.write(self.int_packer.pack(len(ret_list)))
-            data_stream.write(self.int_packer.pack(slot_sample_num))
-
-            for dat in ret_list:
-                dat = dat[idx]
-                if self.use_seq_flag:
-                    seq_stream.write(self.int_packer.pack(indices))
-                    if self.has_subseq[idx]:  # has sub-sequence
-                        for sub_dat in dat:
-                            writeDataStream(sub_dat, data_callback)
-                            subseq_stream.write(self.int_packer.pack(indices))
-                            indices += len(sub_dat)
-                    else:
-                        writeDataStream(dat, data_callback)
-                        indices += len(dat)
-                else:
-                    writeDataStream(dat, data_callback)
-
-        def writeDataStream(dat, data_callback):
-            if self.use_seq_flag > 0:
-                if data_callback is None:  # Special for index slot
-                    data_stream.write(array.array("i", dat).tostring())
-                else:
-                    for ele in dat:
-                        data_callback(ele)
-            else:
-                if data_callback is None:  # Special for index slot
-                    data_stream.write(self.int_packer.pack(dat))
-                else:
-                    data_callback(dat)
-
-        try:
-            for i in range(self.slots_num):
-                slot = self.getSlots()[i]
-                # According to the data_type, each slot data will be converted to binary
-                if isinstance(slot, DenseSlot):
-                    convertDataImpl(i, lambda e: data_stream.write(
-                        array.array("f", e).tostring()))
-                elif isinstance(slot, SparseNonValueSlot):
-                    handler = SparseNonValueHandler()
-                    convertDataImpl(i, handler)
-                    handler.done(data_stream, self.int_packer)
-                elif isinstance(slot, SparseValueSlot):
-                    handler = SparseValueHandler()
-                    convertDataImpl(i, handler)
-                    handler.done(data_stream, self.int_packer)
-                elif isinstance(slot, IndexSlot):
-                    convertDataImpl(i, None)
-                elif isinstance(slot, StringSlot):
-                    handler = StringHandler(data_stream, self.int_packer)
-                    convertDataImpl(i, handler)
-                else:
-                    raise RuntimeError("The data_type must be 0/1/2/3/6")
-            data_stream.flush()
-            seq_stream.flush()
-            subseq_stream.flush()
-
-            return "".join([
-                self.int_packer.pack(current_batch_size), data_bytes.getvalue(),
-                seq_bytes.getvalue(), subseq_bytes.getvalue()
-            ])
-
-        finally:
-            data_stream.close()
-            seq_stream.close()
-            subseq_stream.close()
-            data_bytes.close()
-            seq_bytes.close()
-            subseq_bytes.close()
-
-    def hasSubseq(self, ret_list):
-        # create unified format for ret_list with differnt slots_num
-        if self.slots_num == 1:
-            ret_list = [ret_list]
-        # decide whether slot has sub-sequence using its first sample
-        for i in range(self.slots_num):
-            slot = self.getSlots()[i]
-            dat = ret_list[0][i][0]
-            if isinstance(slot, IndexSlot) or isinstance(slot, StringSlot):
-                if isinstance(dat, list) or isinstance(dat, numpy.ndarray):
-                    self.has_subseq.append(1)  # has_subseq = True
-                    continue
-            elif isinstance(dat[0], list) or isinstance(dat[0], numpy.ndarray):
-                self.has_subseq.append(1)  # has_subseq = True
-                continue
-            self.has_subseq.append(0)  # has_subseq = False
-
-    def checkOrder(self):
-        first_noSubseq_slot = self.slots_num
-        last_subseq_slot = -1
-        for i in range(self.slots_num):
-            if not self.has_subseq[i]:
-                first_noSubseq_slot = i
-                break
-        for i in range(self.slots_num):
-            if self.has_subseq[i]:
-                last_subseq_slot = i
-        if first_noSubseq_slot < last_subseq_slot:
-            raise RuntimeError(
-                "slot hasSubseq must put before than slot without subseq")
-        self.has_checked = True
-
-    def __prepareData(self, batch_size, ret_list):
-        current_batch_size = 0
-        could_exit = False
-        while not could_exit:
-            if len(self.data_pool) == 0:
-                self.data_pool_idx = 0
-                self.fillPool()
-            if len(self.data_pool) != 0:
-                for idx in xrange(self.data_pool_idx, len(self.data_pool)):
-                    current_batch_size += self.calculateDataBatchSize(
-                        self.data_pool[idx])
-                    if current_batch_size >= batch_size:
-                        could_exit = True
-                        break
-                if current_batch_size > batch_size and not self.can_over_batch_size:  # if cannot over batch size
-                    current_batch_size -= self.calculateDataBatchSize(
-                        self.data_pool[idx])
-                    idx -= 1
-
-                ret_list += self.data_pool[self.data_pool_idx:idx + 1]
-
-                # for speed reason, just shift left index, not delete data actually.
-                self.data_pool_idx = idx + 1
-
-                if self.data_pool_idx == len(self.data_pool):
-                    self.data_pool = []
-            else:
-                break
-        if self.use_seq_flag and not self.has_checked:  # compute self.has_subseq and checkOrder only at first time
-            self.hasSubseq(ret_list)
-            self.checkOrder()
-        return current_batch_size
-
-    def fillPool(self):
-        """
-        Fill the pool to max_pool_size. If max_pool_size is None, then read file_count to pool.
-        """
-        if self.max_pool_size == 0:
-            for i in xrange(min(self.file_count, len(self.generators))):
-                self.data_pool += list(self.generators[i])
-            self.generators = self.generators[min(self.file_count,
-                                                  len(self.generators)):]
-            self.max_pool_size = len(self.data_pool)
-        else:
-            while len(self.data_pool) < self.max_pool_size and len(
-                    self.generators) != 0:
-                try:
-                    self.data_pool.append(self.generators[0].next())
-                except StopIteration:
-                    self.generators.pop(0)
-        self.shuffler(self.data_pool)
-
-
-class PoolSize(object):
-    """Max number of sample which contains in provider."""
-
-    def __init__(self, pool_size):
-        self.size = pool_size
-
-
-def default_init_hook(cls, *args, **kwargs):
-    """ default hook, do nothing """
-    del cls, args, kwargs
-
-
-def provider(slots=None,
-             use_seq=False,
-             should_shuffle=True,
-             pool_size=1,
-             can_over_batch_size=True,
-             calc_batch_size=lambda data: 1,
-             debug=False,
-             init_hook=default_init_hook,
-             profile_filename=None):
-    """
-    The decorator for PyDataProvider. User should use this to create Provider class.
-    User should only concern how to read sample from file.
-
-    So the basic usage is:
-
-    ..  code-block:: python
-
-        @provider(some data provider config here...)
-        def process(obj, file_name):
-            while not at end of file_name:
-                sample = readOneSampleFromFile(file_name)
-                yield sample.
-
-    The configuration of data provider should be setup by:
-
-    :param init_hook: A callback will be invoked when PyDataProvider instance \
-                      created. The parameter is (obj, \*args, \*\*kwargs).
-
-                      - **obj**: actually data provider instance, which \
-                                 contains some global objects in obj.xxxxx, \
-                                 and is used by process function.
-
-                        1. **obj.slots**: a list of SlotType Object. Can be \
-                                          set in init. For example, obj.slots = \
-                                          [DenseSlot(9), IndexSlot(2)].
-                        2. **obj.logger**: a logger object. User can invoke \
-                                          obj.logger.info(), obj.logger.fatal(), etc.
-
-                      - **args** and **kwargs**: the data provider __init__ \
-                                                 parameters. For example, load_data_args \
-                                                 will be found in \*\*kwargs, \
-                                                 and if you want to recieve \
-                                                 it from trainer_config, \
-                                                 recommand to use init_hook_wrapper
-    :type init_hook: callable
-
-    :param pool_size:
-                      - **int**: it will read at most pool_size files to memory.
-                      - **PoolSize**: it will read at most PoolSize.size samples to memory.
-                      - If not set, it will read all the files to memory.
-    :type pool_size: int | PoolSize
-
-    :param slots: Specify the SlotTypes, can also be set in init_hook. It has two formats:
-
-                  - A list of SlotType objects. For example, slots = \
-                    [DenseSlot(9), IndexSlot(2)].
-                  - A method return a list of SlotTypes, and the parameter of \
-                    method is (obj, \*file_list, \*\*kwargs).
-    :type slots: list | callable
-
-    :param use_seq:  False if use no sequence (Default). True if use sequence:
-
-                     - If sequence has **no sub-sequence**: Each slot will \
-                       return a list of data. This list is one sequence. \
-                       So the return format likes \
-                       [[a0, a1, a2], [b1, b2, b3, b4], [c1]].
-                     - If sequence has **sub-sequence**: Each slot will return \
-                       a nested-list of data. This list contains several \
-                       sub-lists, each sub-list is one sub-sequence. \
-                       So the return format likes \
-                       [[[a0, a1, a2], [a4, a5]], [[b1, b2, b3, b4], [b5, b6]], [[c1], [c2]]].
-    :type use_seq: bool
-
-    :param should_shuffle: True if data should shuffle.
-    :type should_shuffle: bool
-
-    :param calc_batch_size: The method calculate each data's batch size.
-
-                            - Default is the batch size of one sample.
-                            - User can customize by **lamda** funtion. For example, \
-                              :code:`calc_batch_size = lambda data : len(data)` \
-                              means calculating the token number of a sequence data.
-    :type calc_batch_size: callable
-
-    :param can_over_batch_size: Whether :code:`actual batch size >= input batch size`
-
-                                - **True** (>=): getNextBatch method can return more data (Default).
-                                - **False** (<): user must ensure that each data's batch size < input batch size.
-    :type can_over_batch_size: bool
-
-    :param debug: True if enable debug logger and some debug check. Default is False.
-    :type debug: bool
-
-    :param profile_filename: None if disable profile (Default). Otherwise, \
-                             the data provider will dump profile result when \
-                             reset. And the dump filename is \
-                             **<profile_filename>_<reset_count>**.
-    :type profile_filename: None | Str
-    """
-
-    def _wrapper(handler):
-        class Cls(GeneralPyDataProvider):
-            """ Real PyDataProvider Class. """
-
-            def __init__(self, *file_list, **kwargs):
-                logging.basicConfig(
-                    format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]"
-                    " %(message)s")
-
-                self.logger = logging.getLogger("")
-                if debug:
-                    self.logger.setLevel(logging.DEBUG)
-                    self.logger.debug("Running pydataprovider in debug mode.")
-                else:
-                    self.logger.setLevel(logging.INFO)
-
-                init_hook(self, *file_list, **kwargs)
-                if callable(slots):
-                    self.slots = slots(self, *file_list, **kwargs)
-                elif slots is not None:
-                    self.slots = slots
-
-                if isinstance(pool_size, int):
-                    self.max_pool_size = 0
-                    self.file_count = pool_size
-                elif isinstance(pool_size, PoolSize):
-                    self.max_pool_size = pool_size.size
-                    self.file_count = 0
-                else:
-                    raise RuntimeError
-                self.can_over_batch_size = can_over_batch_size
-                self.debug = debug
-                self.profile_filename = profile_filename
-                self.use_seq_flag = use_seq
-                self.should_shuffle = should_shuffle
-                GeneralPyDataProvider.__init__(self, *file_list, **kwargs)
-
-            def getSlots(self):
-                return self.slots
-
-            def generateData(self, f):
-                return handler(self, f)
-
-            def calculateDataBatchSize(self, data):
-                return calc_batch_size(data)
-
-        return Cls
-
-    return _wrapper
-
-
-def init_hook_wrapper(func):
-    """
-    Wrap a method for PyDataProviderWrapper's init_hook. This method can
-    receive parameter from trainer_config's load_data_args. The load_data_args
-    must pass a pickle.dumps() value, and dump a map as keyword args. The
-    wrapped method :code:`func` will receive them as keyword args.
-
-    So an example usage is:
-
-    ..  code-block:: python
-
-        @init_hook_wrapper
-        def hook(obj, dictionary, file_list, **kwargs):
-            obj.dictionary = dictionary
-            obj.slots = [IndexSlot(len(obj.dictionary)),
-                         IndexSlot(len(open(file_list[0], "r").readlines()))]
-
-    :param func: init_hook function
-    :type func: callable
-    :return: wrapped method, can be passed into @provider.
-    """
-
-    @functools.wraps(func)
-    def wrapper(obj, *file_list, **kwargs):
-        args = kwargs.get("load_data_args", dict())
-        if isinstance(args, basestring):
-            args = pickle.loads(args)
-        args['file_list'] = file_list
-        func(obj=obj, **args)
-
-    return wrapper
diff --git a/python/paddle/trainer/__init__.py b/python/paddle/trainer/__init__.py
deleted file mode 100644
index f662d682632..00000000000
--- a/python/paddle/trainer/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
deleted file mode 100644
index 5b90facd49d..00000000000
--- a/python/paddle/trainer/config_parser.py
+++ /dev/null
@@ -1,4447 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-'''
-The following functions are available in the config file:
-
-Bias: define bias. To be used as value of bias argument in Layer().
-
-Data: define data provider.
-
-Input: define input layer for a layer. To be used as element of inputs argument
-       in Layer().
-
-Conv: define a convolution operation for an input of a layer.
-
-Norm: define a normalization operation for an input of a layer.
-
-Pool: define a pooling operation for an input of a layer.
-
-Layer: define a layer.
-
-Parameter: define a parameter.
-
-Import: import another config file. If the imported config file name is
-        a relative path, then it will be searched under the directory of the
-        current config file.
-
-Inputs(layer_names...):
-    Define the name of the input layers of the NeuralNetwork.
-    The type of these layers must be "data".
-    These layers will be provided with the DataBatch obtained
-    from DataProvider. The data streams from DataProvider must
-    have the same order.
-
-Outputs(layer_names...):
-    Define the name of the output layers of the NeuralNetwork.
-    Usually the output is simply the cost layer.
-    You can specify other layers as outputs and  calculate the
-    cost (and its derivative) yourself.
-
-
-default_initial_std(val)
-default_initial_mean(val)
-default_momentum(val):
-default_decay_rate(val): Set the default value for these parameters
-
-
-get_config_arg(name, type, default): Get the value for a config parameter.
-
-
-*** customized extension to config_parser ***
-The functionality of the config_parser can be extended.
-If the config_arg_str for parse_config() contains
-extension_module_name=[MODULE_NAME], then config_parser will call
-MODULE_NAME.get_config_funcs(g_config)
-MODULE_NAME.get_config_funcs() should return a dictionary of name to functions,
-those functions will be available in the config file.
-See legacy/trainer/tests/config_parser_test.py for example
-
-To use this from paddle_trainer, paddle_trainer should be called with
---config_args=extension_module_name=[MODULE_NAME]
-
-'''
-import copy
-import logging
-import os
-import sys
-import traceback
-import math
-import shutil
-
-try:
-    from paddle.proto.DataConfig_pb2 import DataConfig
-    from paddle.proto.ModelConfig_pb2 import ModelConfig
-    from paddle.proto.ModelConfig_pb2 import LayerConfig
-    from paddle.proto.ModelConfig_pb2 import LayerInputConfig
-    from paddle.proto.ModelConfig_pb2 import ProjectionConfig
-    from paddle.proto.ModelConfig_pb2 import OperatorConfig
-    from paddle.proto.ModelConfig_pb2 import GeneratorConfig
-    from paddle.proto.ModelConfig_pb2 import LinkConfig
-    from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-    from paddle.proto.ParameterConfig_pb2 import ParameterUpdaterHookConfig
-    from paddle.proto.TrainerConfig_pb2 import TrainerConfig
-
-except Exception as e:
-    traceback.print_exc()
-    raise
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-__real_print__ = print
-print = logger.info
-
-# from layer type name to layer class
-g_layer_type_map = {}
-
-
-# Initialize global variables. We use this function so that we can
-# call parse_config() multiple times
-def init_config_environment(
-        g_default_momentum=None,
-        g_default_decay_rate=None,
-        g_default_initial_mean=0.,
-        g_default_initial_std=0.01,
-        g_default_num_batches_regularization=None,
-        g_default_initial_strategy=0,
-        g_default_initial_smart=False,
-        g_default_gradient_clipping_threshold=None,
-        g_default_device=None,
-        g_default_update_hooks=None,
-        g_default_compact_func=None,
-        g_config=TrainerConfig(),
-        g_layer_map={},
-        g_parameter_map={},
-        g_parameter_initializer_map={},
-        g_extended_config_funcs={},
-
-        # store command args of paddle_trainer
-        g_command_config_args={},
-
-        # Used for PyDataProvider to avoid duplicate module name
-        g_py_module_name_list=[],
-        g_current_submodel=None,
-        g_root_submodel=None,
-        g_submodel_map={},
-        g_submodel_stack=[],
-        g_add_submodel_suffix=False, ):
-
-    # directly iterate through locals().iteritems() will change
-    # the size of locals() due to introducing k, v into scope
-    # which will break the process in some env
-
-    local_vars = copy.deepcopy(locals())
-    for k, v in local_vars.iteritems():
-        globals()[k] = v
-
-
-# Because type is widely used as a variable name in this code.
-# we need a different function name for the builtin type()
-def type_of(x):
-    return type(x)
-
-
-# Check a condition derived config file
-def config_assert(b, msg):
-    if not b:
-        logger.fatal(msg)
-
-
-g_config_funcs = {}
-
-
-# decorator for indicating a function which can be used in config file
-def config_func(func):
-    g_config_funcs[func.func_name] = func
-    return func
-
-
-# decorator for indicating a class which can be used in config file
-def config_class(cls):
-    g_config_funcs[cls.__name__] = cls
-    return cls
-
-
-# decorator for indicating a class for a layer type
-def config_layer(layer_type):
-    def wrap(cls):
-        g_config_funcs[cls.__name__] = cls
-        g_layer_type_map[layer_type] = cls
-        return cls
-
-    return wrap
-
-
-def gen_parameter_name(layer_name, input_index):
-    return '_%s.w%d' % (layer_name, input_index)
-
-
-def gen_bias_parameter_name(layer_name):
-    return '_%s.wbias' % layer_name
-
-
-def default(x, default_value):
-    return default_value if x is None else x
-
-
-class Cfg(object):
-    def add_keys(self, locals):
-        for k, v in locals.iteritems():
-            if not k.startswith('_'):
-                self.__setattr__(k, v)
-
-
-# functions available in config file
-
-
-# Define the name of the input layers of the NeuralNetwork.
-# The type of these layers must be "data".
-# These layers will be provided with the DataBatch obtained
-# from DataProvider. The data streams from DataProvider must
-# have the same order.
-@config_func
-def Inputs(*args):
-    for name in args:
-        name = MakeLayerNameInSubmodel(name)
-        global g_current_submodel, g_root_submodel
-        if g_current_submodel.is_recurrent_layer_group:
-            config_assert(False, "Do not set Inputs in recurrent layer group")
-        else:
-            g_current_submodel.input_layer_names.append(name)
-
-        if g_current_submodel is g_root_submodel:
-            g_config.model_config.input_layer_names.append(name)
-
-
-@config_func
-def HasInputsSet():
-    return len(g_current_submodel.input_layer_names) != 0
-
-
-# Define the name of the output layers of the NeuralNetwork.
-# Usually the output is simply the cost layer.
-# You can specify other layers as outputs and calculate the
-# cost (and its derivative) yourself.
-@config_func
-def Outputs(*args):
-    for name in args:
-        name = MakeLayerNameInSubmodel(name)
-        global g_current_submodel, g_root_submodel
-        if g_current_submodel.is_recurrent_layer_group:
-            config_assert(False, "Do not set Outputs in recurrent layer group")
-        else:
-            g_current_submodel.output_layer_names.append(name)
-
-        if g_current_submodel is g_root_submodel:
-            g_config.model_config.output_layer_names.append(name)
-
-
-@config_func
-def SubModelBegin(name):
-    global g_current_submodel, g_root_submodel, g_submodel_stack
-    g_submodel_stack.append(g_current_submodel)
-
-    name = MakeLayerNameInParentSubmodel(name)  #rename in nested submodel
-
-    config_assert(name not in g_submodel_map,
-                  'Duplicated submodel name: %s' % name)
-
-    sub_model = g_config.model_config.sub_models.add()
-    sub_model.name = name
-    g_submodel_map[name] = sub_model
-    g_current_submodel = sub_model
-
-
-@config_func
-def SubModelEnd(name=None):
-    global g_current_submodel, g_root_submodel, g_submodel_stack
-    config_assert(g_current_submodel is not g_root_submodel,
-                  "submodel not begin")
-    if name is not None:
-        config_assert(
-            g_current_submodel.name == MakeLayerNameInParentSubmodel(name),
-            "submodel name error")
-
-    g_current_submodel = g_submodel_stack.pop()
-
-
-def MakeLayerNameInParentSubmodel(name):
-    suffix = ""
-    if len(g_submodel_stack) > 1:
-        suffix = "@" + g_submodel_stack[-1].name
-    return name + suffix
-
-
-def GetLayerBaseName(name):
-    return name.split('@')[0]
-
-
-def MakeLayerNameInSubmodel(name, submodel_name=None):
-    global g_current_submodel
-    global g_add_submodel_suffix
-    if (submodel_name is None and not g_add_submodel_suffix and
-            not g_current_submodel.is_recurrent_layer_group):
-        return name
-    if submodel_name is None:
-        submodel_name = g_current_submodel.name
-    return name + "@" + submodel_name
-
-
-# Define a recurrent layer group begin with RecurrentLayerGroupBegin
-# and end with RecurrentLayerGroupEnd.
-# A recurrent layer group forward/backward one frame after previous frame
-# forward/backward through all layers in layer group.
-# in_links are names of layer used as input layer in the layer group.
-# out_links are names of layer in layer group used as outside layer's input.
-#
-# If generator is set, the layer group need one or more than one outlinks.
-# The first outlink should always be the generated token ids.
-# If generator.num_results_per_sample is not set, the output for one sample is
-# a ids sequence. Else if num_results_per_sample is more than one,
-# the output for one sample is up to #num_results_per_sample generated
-# sequences, which are packed in one sequence in output ids vector. Each
-# generated sequence has a generation probability. The probabilities for one
-# sample are stored in one row of output value matrix.
-# Packed generated sequences format, for each i:
-#   seq_i_length: one interger, seq_i content length,
-#   [seq_i content], length = seq_i_length
-#   seq_i_end_mark: one interger, for format check, always -1
-# You can use "seq_text_printer" to print the output of the generator.
-@config_func
-def RecurrentLayerGroupWithoutOutLinksBegin(name,
-                                            in_links,
-                                            seq_reversed=False,
-                                            target_inlinkname=""):
-    global g_current_submodel
-    config_assert(g_config.model_config.type == "recurrent_nn",
-                  "RecurrentLayerGroup should be used only in recurrent_nn")
-    RecurrentLayerGroup(name=name)  # add to father model
-    SubModelBegin(name)
-    g_current_submodel.is_recurrent_layer_group = True
-    g_current_submodel.reversed = seq_reversed
-    in_links_count = 0
-    for linkid, link in enumerate(in_links):
-        if isinstance(link, basestring):
-            name = link
-        else:
-            name = link.link_name
-
-        in_links_count += 1
-        layer_name = MakeLayerNameInParentSubmodel(name)
-        layer = g_layer_map[layer_name]
-        ScatterAgentLayer(
-            name=name, size=layer.size, width=layer.width, height=layer.height)
-
-        pair = g_current_submodel.in_links.add()
-        pair.layer_name = layer_name
-        pair.link_name = MakeLayerNameInSubmodel(name)
-
-
-@config_func
-def RecurrentLayerGroupSetOutLink(link):
-    if isinstance(link, basestring):
-        name = link
-    else:
-        name = link.link_name
-    layer_name = MakeLayerNameInParentSubmodel(name)
-    pair = g_current_submodel.out_links.add()
-    pair.layer_name = MakeLayerNameInSubmodel(name)
-    pair.link_name = layer_name
-
-
-def RecurrentLayerGroupSetGenerator(generator=None):
-    generator.eos_layer_name = MakeLayerNameInSubmodel(generator.eos_layer_name)
-    g_current_submodel.generator.CopyFrom(generator)
-
-
-@config_func
-def RecurrentLayerGroupBegin(name,
-                             in_links,
-                             out_links,
-                             generator=None,
-                             target_inlinkname="",
-                             seq_reversed=False):
-    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed)
-    for link in out_links:
-        RecurrentLayerGroupSetOutLink(link)
-
-    if generator is not None:
-        RecurrentLayerGroupSetGenerator(generator)
-        config_assert(
-            len(in_links) == 0, "no in_links should be passed to generator")
-        config_assert(
-            len(out_links) >= 1,
-            "one or more than one out_links should be passed to generator")
-
-
-@config_func
-def RecurrentLayerGroupEnd(name):
-    global g_current_submodel
-    config_assert(g_current_submodel.is_recurrent_layer_group,
-                  "RecurrentLayerGroup not begin")
-    for pair in g_current_submodel.memories:  #check exist
-        layer = g_layer_map[pair.layer_name]
-        config_assert(layer is not None,
-                      "memory declare wrong name:%s" % pair.layer_name)
-        memory_link = g_layer_map[pair.link_name]
-        config_assert(layer.size == memory_link.size,
-                      "memory declare wrong size:%d" % memory_link.size)
-
-    prev_submodel = g_current_submodel
-    SubModelEnd(name)
-
-    for pair in prev_submodel.out_links:
-        layer = g_layer_map[pair.layer_name]
-        # add out agent to father model
-        agent_name = GetLayerBaseName(pair.link_name)
-        if prev_submodel.HasField("generator"):
-            DataLayer(name=agent_name, size=layer.size)
-        else:
-            GatherAgentLayer(name=agent_name, size=layer.size)
-
-
-# Define the model type
-# currently, the paddle supports "nn", "recurrent_nn", "recursive_nn" and "multi_nn"
-@config_func
-def model_type(name):
-    g_config.model_config.type = name
-
-
-@config_class
-class Bias(Cfg):
-    def __init__(self,
-                 parameter_name=None,
-                 learning_rate=None,
-                 momentum=None,
-                 decay_rate=None,
-                 decay_rate_l1=None,
-                 initial_mean=None,
-                 initial_std=None,
-                 initial_strategy=None,
-                 initial_smart=None,
-                 num_batches_regularization=None,
-                 sparse_remote_update=None,
-                 gradient_clipping_threshold=None,
-                 is_static=None,
-                 is_shared=None,
-                 initializer=None):
-        self.add_keys(locals())
-
-
-# Define one input for a layer
-@config_class
-class Input(Cfg):
-    def __init__(
-            self,
-            input_layer_name,
-            parameter_name=None,
-            initializer=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            sparse_update=None,
-            gradient_clipping_threshold=None,
-            conv=None,
-            bilinear_interp=None,
-            norm=None,
-            pool=None,
-            image=None,
-            block_expand=None,
-            maxout=None,
-            spp=None,
-            pad=None,
-            upsample=None,
-            format=None,
-            nnz=None,
-            is_static=None,
-            is_shared=None,
-            update_hooks=None,
-            input_layer_argument=None,
-            make_layer_name_in_submodel=True, ):
-        """
-        @param make_layer_name_in_submodel True by defalut, you might need to
-        set it carefully when adding Input in config_parser.py.
-        """
-        self.add_keys(locals())
-        self.input_layer_name = MakeLayerNameInSubmodel(
-            input_layer_name
-        ) if make_layer_name_in_submodel else input_layer_name
-
-
-# Define a projection for iexed layer
-@config_class
-class Projection(Input):
-    type = None  # subclass should set it correctly
-
-    def __init__(
-            self,
-            input_layer_name,
-            size=0,  # projection output size
-            parameter_name=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            initializer=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            sparse_update=None,
-            gradient_clipping_threshold=None,
-            ptype=None,
-            format=None,
-            nnz=None,
-            is_static=None,
-            is_shared=None,
-            update_hooks=None,
-            input_layer_argument=None, ):
-        self.add_keys(locals())
-        self.input_layer_name = MakeLayerNameInSubmodel(input_layer_name)
-
-        self.proj_conf = ProjectionConfig()
-        if ptype is not None:
-            self.proj_conf.type = ptype
-        else:
-            self.proj_conf.type = self.type
-
-    # calculate the output_size given input_size. return 0
-    # to indicate using the size from Layer config
-    def calc_output_size(self, input_layer_config):
-        return self.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        raise NotimplementedError
-
-    def calc_parameter_dims(self, input_size, output_size):
-        raise NotimplementedError
-
-
-@config_class
-class IdentityProjection(Projection):
-    type = 'identity'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-# Like IdentityProjection, but layer size may smaller than input size,
-# the projection select dimesions [offset, offset+layer_size) from input
-@config_class
-class IdentityOffsetProjection(Projection):
-    type = 'identity_offset'
-
-    def __init__(self, input_layer_name, offset, **xargs):
-        super(IdentityOffsetProjection, self).__init__(input_layer_name,
-                                                       **xargs)
-        self.proj_conf.offset = offset
-
-    def calc_output_size(self, input_layer_config):
-        return 0  # depends on the outside MixedLayer
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-@config_class
-class SliceProjection(Projection):
-    type = 'slice'
-
-    def __init__(self, input_layer_name, slices, **xargs):
-        super(SliceProjection, self).__init__(input_layer_name, **xargs)
-        input = g_layer_map[input_layer_name]
-        if input.type in ["exconv", "cudnn_conv"]:
-            # the slice operator is for the channel dimension
-            assert input.num_filters is not None
-            channels = input.num_filters
-            image_size = input.size / channels
-            assert slices[len(slices) - 1][1] <= channels
-            for i in xrange(len(slices)):
-                slice = self.proj_conf.slices.add()
-                slice.start = slices[i][0] * image_size
-                slice.end = slices[i][1] * image_size
-                self.size += slice.end - slice.start
-        else:
-            config_assert(False,
-                          'Currently the input should be convolution layer')
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-# DotMulProjection performs element-wise multiplication with weight
-@config_class
-class DotMulProjection(Projection):
-    type = 'dot_mul'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [1, output_size]
-
-
-# ScalingProjection
-@config_class
-class ScalingProjection(Projection):
-    type = 'scaling'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 1
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [1, 1]
-
-
-@config_class
-class TableProjection(Projection):
-    type = 'table'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [input_size, output_size]
-
-
-@config_class
-class FullMatrixProjection(Projection):
-    type = 'fc'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [input_size, output_size]
-
-
-@config_class
-class TransposedFullMatrixProjection(Projection):
-    type = 'trans_fc'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [output_size, input_size]
-
-
-@config_class
-class ContextProjection(Projection):
-    type = 'context'
-
-    def __init__(self, input_layer_name, context_start, context_length,
-                 trainable_padding, **xargs):
-        super(ContextProjection, self).__init__(input_layer_name, **xargs)
-        self.proj_conf.context_start = context_start
-        self.proj_conf.context_length = context_length
-        self.proj_conf.trainable_padding = trainable_padding
-        self._total_pad = max(0, -self.proj_conf.context_start) \
-                          + max(0, self.proj_conf.context_start \
-                                + self.proj_conf.context_length - 1)
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size * self.proj_conf.context_length
-
-    def calc_parameter_size(self, input_size, output_size):
-        if self.proj_conf.trainable_padding == False:
-            return 0
-        else:
-            return input_size * self._total_pad
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [self._total_pad, input_size]
-
-    _total_pad = 0
-
-
-@config_class
-class ConvBaseProjection(Projection):
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvBaseProjection, self).__init__(input_layer_name, **xargs)
-
-        if num_filters is not None:
-            self.proj_conf.num_filters = num_filters
-
-    def calc_output_size(self, input_layer_config):
-        return self.proj_conf.output_size
-
-    def calc_parameter_size(self, input_size, output_size):
-        co = self.proj_conf.num_filters
-        ci = self.proj_conf.conv_conf.channels
-        fh = self.proj_conf.conv_conf.filter_size
-        fw = self.proj_conf.conv_conf.filter_size_y
-        gr = self.proj_conf.conv_conf.groups
-        return co * ci * fh * fw / gr
-
-    def calc_bias_size(self):
-        return self.proj_conf.num_filters
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return None
-
-
-@config_class
-class ConvProjection(ConvBaseProjection):
-    type = 'conv'
-
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvProjection, self).__init__(input_layer_name, num_filters,
-                                             conv_conf, **xargs)
-
-        parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf,
-                   num_filters)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
-                                     self.proj_conf.conv_conf.output_y * \
-                                     num_filters
-
-
-@config_class
-class ConvTransProjection(ConvBaseProjection):
-    type = 'convt'
-
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvTransProjection, self).__init__(input_layer_name, num_filters,
-                                                  conv_conf, **xargs)
-
-        parse_conv(
-            conv_conf,
-            self.input_layer_name,
-            self.proj_conf.conv_conf,
-            num_filters,
-            trans=True)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \
-                                     self.proj_conf.conv_conf.img_size * \
-                                     num_filters
-
-
-# Define a operator for mixed layer
-@config_class
-class Operator(Cfg):
-    type = None  # subclass should set it correctly
-
-    def __init__(
-            self,
-            input_layer_names, ):
-        self.add_keys(locals())
-        self.operator_conf = OperatorConfig()
-        self.operator_conf.type = self.type
-
-    def check_dims(self):
-        pass
-
-    def calc_output_size(self, input_sizes):
-        return 0
-
-
-@config_class
-class DotMulOperator(Operator):
-    type = 'dot_mul'
-
-    def __init__(self, input_layer_names, scale=None, **xargs):
-        super(DotMulOperator, self).__init__(input_layer_names, **xargs)
-        if scale is not None:
-            self.operator_conf.dotmul_scale = scale
-
-        config_assert(len(input_layer_names) == 2, "DotMul is binary operator")
-
-    def check_dims(self):
-        for i in range(2):
-            config_assert(self.operator_conf.input_sizes[i] ==
-                          self.operator_conf.output_size,
-                          "DotMul input_size != output_size")
-
-    def calc_output_size(self, input_sizes):
-        return input_sizes[0]
-
-
-@config_class
-class ConvOperator(Operator):
-    type = 'conv'
-
-    def __init__(self,
-                 input_layer_names,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvOperator, self).__init__(input_layer_names, **xargs)
-        if num_filters is not None:
-            self.operator_conf.num_filters = num_filters
-
-        parse_conv(conv_conf,
-                   MakeLayerNameInSubmodel(input_layer_names[0]),
-                   self.operator_conf.conv_conf, num_filters)
-        self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
-                                         self.operator_conf.conv_conf.output_y * \
-                                         num_filters
-
-        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
-
-    def calc_output_size(self, input_sizes):
-        return self.operator_conf.output_size
-
-
-@config_class
-class ConvTransOperator(Operator):
-    type = 'convt'
-
-    def __init__(self,
-                 input_layer_names,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvTransOperator, self).__init__(input_layer_names, **xargs)
-        if num_filters is not None:
-            self.operator_conf.num_filters = num_filters
-
-        parse_conv(
-            conv_conf,
-            MakeLayerNameInSubmodel(input_layer_names[0]),
-            self.operator_conf.conv_conf,
-            num_filters,
-            trans=True)
-        self.operator_conf.output_size = \
-            self.operator_conf.conv_conf.img_size * \
-            self.operator_conf.conv_conf.img_size_y * \
-            num_filters
-
-        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
-
-    def calc_output_size(self, input_sizes):
-        return self.operator_conf.output_size
-
-
-# please refer to the comments in proto/ModelConfig.proto
-@config_class
-class Conv(Cfg):
-    def __init__(self,
-                 filter_size,
-                 channels,
-                 padding=None,
-                 stride=None,
-                 groups=None,
-                 filter_channels=None,
-                 output_x=None,
-                 img_size=None,
-                 caffe_mode=True,
-                 filter_size_y=None,
-                 padding_y=None,
-                 stride_y=None,
-                 dilation=None,
-                 dilation_y=None):
-        self.add_keys(locals())
-        if filter_size_y is None:
-            self.filter_size_y = filter_size
-        if padding_y is None:
-            self.padding_y = padding
-        if dilation_y is None:
-            self.dilation_y = dilation
-        if stride_y is None:
-            self.stride_y = stride
-        if output_x is not None:
-            config_assert(output_x <= 0)
-
-
-# please refer to the comments in proto/ModelConfig.proto
-@config_class
-class Conv3D(Cfg):
-    def __init__(self,
-                 filter_size,
-                 channels,
-                 padding=None,
-                 stride=None,
-                 groups=None,
-                 filter_channels=None,
-                 output_x=None,
-                 img_size=None,
-                 caffe_mode=True,
-                 filter_size_y=None,
-                 padding_y=None,
-                 stride_y=None,
-                 filter_size_z=None,
-                 padding_z=None,
-                 stride_z=None):
-        self.add_keys(locals())
-        self.filter_size_y = filter_size_y if filter_size_y else filter_size
-        self.filter_size_z = filter_size_z if filter_size_z else filter_size
-        self.padding_y = padding_y if padding_y else padding
-        self.padding_z = padding_z if padding_z else padding
-        self.stride_y = stride_y if stride_y else stride
-        self.stride_z = stride_z if stride_z else stride
-        if output_x is not None:
-            config_assert(output_x <= 0)
-
-
-@config_class
-class BilinearInterp(Cfg):
-    def __init__(self, out_size_x=None, out_size_y=None, channels=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Pool(Cfg):
-    def __init__(
-            self,
-            pool_type,
-            channels,
-            size_x,
-            size_y=None,
-            start=None,
-            stride=None,  # 1 by defalut in protobuf
-            stride_y=None,
-            padding=None,  # 0 by defalut in protobuf
-            padding_y=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Pool3d(Cfg):
-    def __init__(
-            self,
-            pool_type,
-            channels,
-            size_x,
-            size_y=None,
-            size_z=None,
-            start=None,
-            stride=None,  # 1 by defalut in protobuf
-            stride_y=None,
-            stride_z=None,
-            padding=None,  # 0 by defalut in protobuf
-            padding_y=None,
-            padding_z=None):
-        self.add_keys(locals())
-        self.filter_size_y = size_y if size_y else size_x
-        self.filter_size_z = size_z if size_z else size_x
-        self.padding_y = padding_y if padding_y else padding
-        self.padding_z = padding_z if padding_z else padding
-        self.stride_y = stride_y if stride_y else stride
-        self.stride_z = stride_z if stride_z else stride
-
-
-@config_class
-class SpatialPyramidPool(Cfg):
-    def __init__(self, pool_type, pyramid_height, channels):
-        self.add_keys(locals())
-
-
-@config_class
-class Pad(Cfg):
-    def __init__(self, channels, pad_c, pad_h, pad_w):
-        self.add_keys(locals())
-
-
-@config_class
-class Upsample(Cfg):
-    def __init__(self, scale, scale_y, pad_out_x, pad_out_y, upsample_size,
-                 upsample_size_y):
-        self.add_keys(locals())
-
-
-@config_class
-class Norm(Cfg):
-    def __init__(self,
-                 norm_type,
-                 channels,
-                 size,
-                 scale,
-                 pow,
-                 output_x=None,
-                 img_size=None,
-                 blocked=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Image(Cfg):
-    def __init__(self, channels, img_size=None):
-        self.add_keys(locals())
-
-
-@config_class
-class BlockExpand(Cfg):
-    def __init__(self,
-                 channels,
-                 padding_x=0,
-                 padding_y=0,
-                 stride_x=0,
-                 stride_y=0,
-                 block_x=0,
-                 block_y=0,
-                 img_size_x=0,
-                 img_size_y=0,
-                 output_x=0,
-                 output_y=0):
-        self.add_keys(locals())
-
-
-@config_class
-class MaxOut(Cfg):
-    def __init__(self, channels, groups, img_size_x=0, img_size_y=0):
-        self.add_keys(locals())
-
-
-def create_data_config_proto(async_load_data=False,
-                             constant_slots=None,
-                             data_ratio=1,
-                             is_main_data=True,
-                             usage_ratio=None):
-    # default: all sub dataproviders are treat as "main data".
-    # see proto/DataConfig.proto for is_main_data
-    data_config = DataConfig()
-
-    data_config.async_load_data = async_load_data
-
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    data_config.data_ratio = data_ratio
-    data_config.is_main_data = is_main_data
-
-    usage_ratio = default(usage_ratio, settings_deprecated["usage_ratio"])
-    config_assert(usage_ratio >= 0 and usage_ratio <= 1,
-                  "The range of usage_ratio is [0, 1]")
-    data_config.usage_ratio = usage_ratio
-
-    return data_config
-
-
-@config_func
-def SimpleData(files=None,
-               feat_dim=None,
-               context_len=None,
-               buffer_capacity=None,
-               **xargs):
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = 'simple'
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    if context_len is not None:
-        data_config.context_len = context_len
-    if buffer_capacity:
-        data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-@config_func
-def PyData(files=None,
-           type=None,
-           file_group_queue_capacity=None,
-           load_data_module=None,
-           load_data_object=None,
-           load_data_args="",
-           load_file_count=None,
-           constant_slots=None,
-           load_thread_num=None,
-           **xargs):
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = 'py'
-    if load_data_module in g_py_module_name_list:
-
-        def get_path(module):
-            m = __import__(load_data_module)
-            return os.path.split(os.path.realpath(m.__file__))[0]
-
-        # python C-api is not thread safe, one module can only be import once,
-        # so here we nedd to copy the module with different names if it has to be
-        # imported several times.
-        module_new_name = "%s_copy_%d" % (load_data_module,
-                                          len(g_py_module_name_list))
-        g_py_module_name_list.append(module_new_name)
-        module_path = "%s/%s.py" % (get_path(load_data_module),
-                                    load_data_module)
-        new_module_path = "%s/%s.py" % (get_path(load_data_module),
-                                        module_new_name)
-        if os.path.isfile(module_path) == False:
-            raise Exception("File %s is not exist." % module_path)
-        shutil.copy2(module_path, new_module_path)
-        load_data_module = module_new_name
-    else:
-        g_py_module_name_list.append(load_data_module)
-    if load_data_module is not None and load_data_object is not None:
-        data_config.load_data_module = load_data_module
-        data_config.load_data_object = load_data_object
-    else:
-        raise ValueError('load_data_module, load_data_object is not defined.')
-    data_config.load_data_args = load_data_args
-
-    data_config.files = files or ''
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
-#real data for training is actually provided by "sub_data" data providers.
-@config_func
-def MultiData(sub_data=[]):
-    data_config = DataConfig()
-    data_config.type = 'multi'
-    data_config.sub_data_configs.extend(sub_data)
-    return data_config
-
-
-@config_func
-def Data(type,
-         files=None,
-         feat_dim=None,
-         slot_dims=None,
-         context_len=None,
-         buffer_capacity=None,
-         **xargs):
-
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = type
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    data_config.slot_dims.extend(slot_dims)
-    if context_len is not None:
-        data_config.context_len = context_len
-    data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-@config_func
-def TrainData(data_config, async_load_data=None):
-    config_assert(not g_config.HasField('data_config'),
-                  'Only one TrainData definition is allowed')
-    g_config.data_config.CopyFrom(data_config)
-    g_config.data_config.for_test = False
-    if async_load_data is not None:
-        logger.warning("Deprecated: async_load_data should be used inside"
-                       " Data definition")
-        g_config.data_config.async_load_data = async_load_data
-
-
-@config_func
-def TestData(data_config, async_load_data=None):
-    config_assert(not g_config.HasField('test_data_config'),
-                  'Only one TestData definition is allowed')
-    g_config.test_data_config.CopyFrom(data_config)
-    g_config.test_data_config.for_test = True
-    if async_load_data is not None:
-        logger.warning("Deprecated: async_load_data should be used inside"
-                       " Data definition")
-        g_config.test_data_config.async_load_data = async_load_data
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size,
-                    filter_size,
-                    padding,
-                    stride,
-                    caffe_mode,
-                    dilation=1):
-    filter_s = (filter_size - 1) * dilation + 1
-    output = (2 * padding + img_size - filter_s) / float(stride)
-    if caffe_mode:
-        return 1 + int(math.floor(output))
-    else:
-        return 1 + int(math.ceil(output))
-
-
-#calcualte image_size based on output_size for de-convolution (ConvTransLayer).
-#It is the reverse function of cnn_output_size
-def cnn_image_size(output_size,
-                   filter_size,
-                   padding,
-                   stride,
-                   caffe_mode,
-                   dilation=1):
-    filter_s = (filter_size - 1) * dilation + 1
-    img_size = (output_size - 1) * stride + filter_s - 2 * padding
-    if not caffe_mode:
-        img_size = img_size + 1
-    return img_size
-
-
-def get_img_size(input_layer_name, channels):
-    input = g_layer_map[input_layer_name]
-    img_pixels = input.size / channels
-    img_size = input.width if input.width > 0 else int(img_pixels**0.5)
-    img_size_y = input.height if input.height > 0 else int(img_pixels /
-                                                           img_size)
-    config_assert(
-        img_size * img_size_y == img_pixels,
-        "Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
-        % (input_layer_name, img_size, img_size_y, img_pixels))
-    return img_size, img_size_y
-
-
-def get_img3d_size(input_layer_name, channels):
-    input = g_layer_map[input_layer_name]
-    img_pixels = input.size / channels
-    img_size = input.width
-    img_size_y = input.height
-    img_size_z = input.depth
-
-    config_assert(
-        img_size * img_size_y * img_size_z == img_pixels,
-        "Input layer %s: Incorrect input image size %d * %d * %d for input image pixels %d"
-        % (input_layer_name, img_size, img_size_y, img_size_z, img_pixels))
-    return img_size, img_size_y, img_size_z
-
-
-def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
-    parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
-    bilinear_conf.out_size_x = bilinear.out_size_x
-    bilinear_conf.out_size_y = bilinear.out_size_y
-
-
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
-    pool_conf.pool_type = pool.pool_type
-    config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in " \
-              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
-                  "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
-
-    pool_conf.channels = pool.channels
-    pool_conf.size_x = pool.size_x
-    pool_conf.stride = pool.stride
-
-    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
-    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
-
-    pool_conf.img_size, pool_conf.img_size_y = \
-        get_img_size(input_layer_name, pool.channels)
-
-    config_assert(not pool.start, "start is deprecated in pooling.")
-
-    if pool.padding is not None:
-        pool_conf.padding = pool.padding
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
-                                         pool_conf.padding, pool_conf.stride,
-                                         not ceil_mode)
-    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
-                                         pool_conf.padding_y,
-                                         pool_conf.stride_y, not ceil_mode)
-    if exclude_mode != None:
-        pool_conf.exclude_mode = exclude_mode
-
-
-def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
-    pool_conf.pool_type = pool.pool_type
-    config_assert(pool.pool_type in ['max-projection', 'avg-projection'],
-                  "pool-type %s is not in "
-                  "['max-projection', 'avg-projection']" % pool.pool_type)
-
-    pool_conf.channels = pool.channels
-
-    pool_conf.size_x = pool.size_x
-    pool_conf.stride = pool.stride
-    pool_conf.padding = pool.padding
-
-    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
-    pool_conf.size_z = default(pool.size_z, pool_conf.size_x)
-    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
-    pool_conf.stride_z = default(pool.stride_z, pool_conf.stride)
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
-
-    pool_conf.img_size, pool_conf.img_size_y, pool_conf.img_size_z = \
-        get_img3d_size(input_layer_name, pool.channels)
-
-    config_assert(not pool.start, "start is deprecated in pooling.")
-
-    if pool.padding is not None:
-        pool_conf.padding = pool.padding
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
-    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
-                                         pool_conf.padding, pool_conf.stride,
-                                         not ceil_mode)
-    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
-                                         pool_conf.padding_y,
-                                         pool_conf.stride_y, not ceil_mode)
-    pool_conf.output_z = cnn_output_size(pool_conf.img_size_z, pool_conf.size_z,
-                                         pool_conf.padding_z,
-                                         pool_conf.stride_z, not ceil_mode)
-
-
-def parse_spp(spp, input_layer_name, spp_conf):
-    parse_image(spp, input_layer_name, spp_conf.image_conf)
-    spp_conf.pool_type = spp.pool_type
-    config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
-                  "pool-type %s is not in "
-                  "['max-projection', 'avg-projection']" % spp.pool_type)
-    spp_conf.pyramid_height = spp.pyramid_height
-
-
-def parse_image(image, input_layer_name, image_conf):
-    image_conf.channels = image.channels
-    image_conf.img_size, image_conf.img_size_y = \
-        get_img_size(input_layer_name, image_conf.channels)
-
-
-def parse_image3d(image, input_layer_name, image_conf):
-    image_conf.channels = image.channels
-    image_conf.img_size, image_conf.img_size_y, image_conf.img_size_z = \
-        get_img3d_size(input_layer_name, image_conf.channels)
-
-
-def parse_norm(norm, input_layer_name, norm_conf):
-    norm_conf.norm_type = norm.norm_type
-    config_assert(
-        norm.norm_type in
-        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
-        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
-        % norm.norm_type)
-    norm_conf.channels = norm.channels
-    norm_conf.size = norm.size
-    norm_conf.scale = norm.scale
-    norm_conf.pow = norm.pow
-    norm_conf.blocked = norm.blocked
-
-    norm_conf.img_size, norm_conf.img_size_y = \
-        get_img_size(input_layer_name, norm.channels)
-    norm_conf.output_x = norm_conf.img_size
-    norm_conf.output_y = norm_conf.img_size_y
-    if norm.norm_type in ['cmrnorm-projection']:
-        norm_conf.scale /= norm.size
-    else:
-        norm_conf.scale /= norm.size**2
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
-    conv_conf.filter_size = conv.filter_size
-    conv_conf.filter_size_y = conv.filter_size_y
-    conv_conf.channels = conv.channels
-    conv_conf.padding = conv.padding
-    conv_conf.padding_y = conv.padding_y
-    conv_conf.stride = conv.stride
-    conv_conf.stride_y = conv.stride_y
-    conv_conf.groups = conv.groups
-    conv_conf.caffe_mode = conv.caffe_mode
-    if not conv.dilation:
-        conv.dilation = 1
-        conv.dilation_y = 1
-    else:
-        conv_conf.dilation = conv.dilation
-        conv_conf.dilation_y = conv.dilation_y
-
-    if not trans:
-        conv_conf.filter_channels = conv.channels / conv.groups
-        conv_conf.img_size, conv_conf.img_size_y = \
-            get_img_size(input_layer_name, conv.channels)
-        conv_conf.output_x = cnn_output_size(
-            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
-        conv_conf.output_y = cnn_output_size(
-            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
-    else:
-        conv_conf.filter_channels = num_filters / conv.groups
-        conv_conf.output_x, conv_conf.output_y = \
-            get_img_size(input_layer_name, conv.channels)
-        conv_conf.img_size = cnn_image_size(
-            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
-        conv_conf.img_size_y = cnn_image_size(
-            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def parse_conv3d(conv, input_layer_name, conv_conf, num_filters, trans=False):
-    conv_conf.filter_size = conv.filter_size
-    conv_conf.filter_size_y = conv.filter_size_y
-    conv_conf.filter_size_z = conv.filter_size_z
-    conv_conf.channels = conv.channels
-    conv_conf.padding = conv.padding
-    conv_conf.padding_y = conv.padding_y
-    conv_conf.padding_z = conv.padding_z
-    conv_conf.stride = conv.stride
-    conv_conf.stride_y = conv.stride_y
-    conv_conf.stride_z = conv.stride_z
-    conv_conf.groups = conv.groups
-    conv_conf.caffe_mode = conv.caffe_mode
-
-    if not trans:
-        conv_conf.filter_channels = conv.channels / conv.groups
-        conv_conf.img_size, conv_conf.img_size_y, conv_conf.img_size_z = \
-            get_img3d_size(input_layer_name, conv.channels)
-        conv_conf.output_x = cnn_output_size(
-            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
-        conv_conf.output_y = cnn_output_size(
-            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
-        conv_conf.output_z = cnn_output_size(
-            conv_conf.img_size_z, conv_conf.filter_size_z, conv_conf.padding_z,
-            conv_conf.stride_z, conv_conf.caffe_mode)
-    else:
-        conv_conf.filter_channels = num_filters / conv.groups
-        conv_conf.output_x, conv_conf.output_y, conv_conf.output_z = \
-            get_img3d_size(input_layer_name, conv.channels)
-        conv_conf.img_size = cnn_image_size(
-            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
-        conv_conf.img_size_y = cnn_image_size(
-            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
-        conv_conf.img_size_z = cnn_image_size(
-            conv_conf.output_z, conv_conf.filter_size_z, conv_conf.padding_z,
-            conv_conf.stride_z, conv_conf.caffe_mode)
-
-
-def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
-    block_expand_conf.channels = block_expand.channels
-    block_expand_conf.stride_x = block_expand.stride_x
-    block_expand_conf.stride_y = block_expand.stride_y
-    block_expand_conf.padding_x = block_expand.padding_x
-    block_expand_conf.padding_y = block_expand.padding_y
-    block_expand_conf.block_x = block_expand.block_x
-    block_expand_conf.block_y = block_expand.block_y
-    block_expand_conf.img_size_x = block_expand.img_size_x
-    block_expand_conf.img_size_y = block_expand.img_size_y
-    if block_expand_conf.img_size_x == 0:
-        block_expand_conf.output_x = 0
-    else:
-        block_expand_conf.output_x = cnn_output_size(
-            block_expand.img_size_x, block_expand.block_x,
-            block_expand.padding_x, block_expand.stride_x, False)
-
-    if block_expand_conf.img_size_y == 0:
-        block_expand_conf.output_y = 0
-    else:
-        block_expand_conf.output_y = cnn_output_size(
-            block_expand.img_size_y, block_expand.block_y,
-            block_expand.padding_y, block_expand.stride_y, False)
-
-
-def parse_maxout(maxout, input_layer_name, maxout_conf):
-    parse_image(maxout, input_layer_name, maxout_conf.image_conf)
-    maxout_conf.groups = maxout.groups
-
-
-# Define an evaluator
-@config_func
-def Evaluator(name,
-              type,
-              inputs,
-              chunk_scheme=None,
-              num_chunk_types=None,
-              classification_threshold=None,
-              positive_label=None,
-              dict_file=None,
-              result_file=None,
-              num_results=None,
-              top_k=None,
-              delimited=None,
-              excluded_chunk_types=None,
-              overlap_threshold=None,
-              background_id=None,
-              evaluate_difficult=None,
-              ap_type=None):
-    evaluator = g_config.model_config.evaluators.add()
-    evaluator.type = type
-    evaluator.name = MakeLayerNameInSubmodel(name)
-    if type_of(inputs) == str:
-        inputs = [inputs]
-
-    evaluator.input_layers.extend(
-        [MakeLayerNameInSubmodel(name) for name in inputs])
-
-    if chunk_scheme is not None:
-        evaluator.chunk_scheme = chunk_scheme
-        evaluator.num_chunk_types = num_chunk_types
-    g_current_submodel.evaluator_names.append(evaluator.name)
-
-    if classification_threshold is not None:
-        evaluator.classification_threshold = classification_threshold
-    if positive_label is not None:
-        evaluator.positive_label = positive_label
-    if dict_file is not None:
-        evaluator.dict_file = dict_file
-
-    if result_file is not None:
-        evaluator.result_file = result_file
-    if num_results is not None:
-        evaluator.num_results = num_results
-    if top_k is not None:
-        evaluator.top_k = top_k
-    if delimited is not None:
-        evaluator.delimited = delimited
-
-    if excluded_chunk_types:
-        evaluator.excluded_chunk_types.extend(excluded_chunk_types)
-
-    if overlap_threshold is not None:
-        evaluator.overlap_threshold = overlap_threshold
-
-    if background_id is not None:
-        evaluator.background_id = background_id
-
-    if evaluate_difficult is not None:
-        evaluator.evaluate_difficult = evaluate_difficult
-
-    if ap_type is not None:
-        evaluator.ap_type = ap_type
-
-
-class LayerBase(object):
-    def __init__(
-            self,
-            name,
-            type,
-            size,  # size can be 0. In this case, subclass should set it.
-            inputs,
-            device=None,
-            active_type="",
-            drop_rate=0.,
-            coeff=None,
-            error_clipping_threshold=None):
-        config_assert('@' not in name,
-                      "layer name: %s contain special character @" % name)
-        global g_current_submodel
-        name = MakeLayerNameInSubmodel(name)
-
-        config_assert(name not in g_layer_map,
-                      'Duplicated layer name: %s' % name)
-
-        self.inputs = copy.deepcopy(inputs)
-        self.operators = []
-
-        if self.inputs is None:
-            self.inputs = []
-        elif type_of(self.inputs) != list:
-            self.inputs = [self.inputs]
-
-        self.config = g_config.model_config.layers.add()
-        assert isinstance(self.config, LayerConfig)
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        mkldnn_acts = ['relu', 'tanh', 'softmax']
-        if use_mkldnn and active_type in mkldnn_acts:
-            active_type = "mkldnn_" + active_type
-        self.config.name = name
-        self.config.type = type
-        self.config.active_type = active_type
-        if coeff is not None:
-            self.config.coeff = float(coeff)
-        if size != 0:
-            self.config.size = size
-        if drop_rate != 0:
-            self.config.drop_rate = drop_rate
-
-        if device is not None:
-            self.config.device = device
-        elif g_default_device is not None:
-            self.config.device = g_default_device
-
-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-
-        for input_index in xrange(len(self.inputs)):
-            input = self.inputs[input_index]
-            input_config = None
-            input_layer_name = ''
-            if type_of(input) == str:
-                input_layer_name = input
-                input_config = Input(
-                    input_layer_name=input,
-                    parameter_name=gen_parameter_name(name, input_index))
-                input_layer_name = input_config.input_layer_name
-            elif isinstance(input, Input):
-                input_layer_name = input.input_layer_name
-                input_config = input
-                if input_config.parameter_name is None:
-                    input_config.parameter_name = \
-                        gen_parameter_name(name, input_index)
-            elif isinstance(input, Operator):
-                self.operators.append(input)
-                input.operator_conf.input_indices.append(input_index)
-                input_config = Input(input.input_layer_names[0])
-                input_layer_name = input_config.input_layer_name
-            else:
-                raise ValueError('Wrong type for inputs: %s' % type_of(input))
-            config_assert(input_layer_name in g_layer_map,
-                          "Unknown input layer '%s' for layer %s" %
-                          (input_layer_name, name))
-            self.inputs[input_index] = input_config
-            layer_input = self.config.inputs.add()
-            layer_input.input_layer_name = input_config.input_layer_name
-            if input_config.input_layer_argument is not None:
-                layer_input.input_layer_argument = \
-                    input_config.input_layer_argument
-
-        g_layer_map[name] = self.config
-
-        g_current_submodel.layer_names.append(self.config.name)
-
-    def get_input_layer(self, input_index):
-        return g_layer_map[self.config.inputs[input_index].input_layer_name]
-
-    # will return the bias created if not *for_self*
-    def create_bias_parameter(
-            self,
-            bias,  # True/False or BiasCfg
-            size,
-            dims=None,
-            for_self=True,  # whether create bias for layer self
-    ):
-
-        if size == 0:
-            return
-        if dims is None:
-            dims = [1, size]
-
-        config_assert(
-            type_of(bias) == bool or type_of(bias) == Bias,
-            'Incorrect type for bias: %s' % type_of(bias))
-
-        if type_of(bias) == bool:
-            if bias:
-                bias = Bias()
-
-        if type_of(bias) == Bias:
-            if bias.parameter_name is None:
-                bias.parameter_name = gen_bias_parameter_name(self.config.name)
-            if bias.parameter_name not in g_parameter_map:
-                assert isinstance(self.config, LayerConfig)
-
-                Parameter(
-                    bias.parameter_name,
-                    size,
-                    self.config.device
-                    if self.config.HasField('device') else None,
-                    dims,
-                    bias.learning_rate,
-                    bias.momentum,
-                    decay_rate=bias.decay_rate,
-                    decay_rate_l1=bias.decay_rate_l1,
-                    initial_mean=bias.initial_mean,
-                    initial_std=bias.initial_std,
-                    initial_strategy=bias.initial_strategy,
-                    initial_smart=bias.initial_smart,
-                    num_batches_regularization=bias.num_batches_regularization,
-                    sparse_remote_update=bias.sparse_remote_update,
-                    gradient_clipping_threshold=bias.
-                    gradient_clipping_threshold,
-                    is_static=bias.is_static,
-                    is_shared=bias.is_shared,
-                    initializer=bias.initializer)
-            if for_self:
-                self.config.bias_parameter_name = bias.parameter_name
-            else:
-                return bias.parameter_name
-
-    def create_input_parameter(self,
-                               input_index,
-                               size,
-                               dims=None,
-                               sparse=None,
-                               format=None):
-        if dims is None:
-            # TODO(yuyang18): print warning and callstack here!
-            dims = list()
-
-        if size == 0:
-            return
-
-        input_config = self.inputs[input_index]
-
-        self.config.inputs[input_index].input_parameter_name = \
-            input_config.parameter_name
-
-        if input_config.parameter_name in g_parameter_map:
-            para = g_parameter_map[input_config.parameter_name]
-            config_assert(size == para.size, (
-                'Shared parameter "%s" does not ' + 'have same size: %s vs. %s')
-                          % (input_config.parameter_name, para.size, size))
-
-            config_assert(dims == para.dims, (
-                'Shared parameter "%s" does not ' + 'have same dims: %s vs. %s')
-                          % (input_config.parameter_name, para.dims, dims))
-            return
-
-        Parameter(
-            input_config.parameter_name,
-            size,
-            self.config.device if self.config.HasField("device") else None,
-            dims,
-            input_config.learning_rate,
-            input_config.momentum,
-            decay_rate=input_config.decay_rate,
-            decay_rate_l1=input_config.decay_rate_l1,
-            initial_mean=input_config.initial_mean,
-            initial_std=input_config.initial_std,
-            initial_strategy=input_config.initial_strategy,
-            initial_smart=input_config.initial_smart,
-            num_batches_regularization=input_config.num_batches_regularization,
-            sparse_remote_update=input_config.sparse_remote_update,
-            sparse_update=input_config.sparse_update,
-            gradient_clipping_threshold=input_config.
-            gradient_clipping_threshold,
-            sparse=sparse,
-            format=format,
-            is_static=input_config.is_static,
-            is_shared=input_config.is_shared,
-            update_hooks=input_config.update_hooks,
-            initializer=input_config.initializer)
-
-    def set_layer_size(self, size):
-        if self.config.size == 0:
-            self.config.size = size
-        else:
-            config_assert(self.config.size == size,
-                          'Different inputs result in' +
-                          'different layer size at layer %s' % self.config.name)
-
-    def set_layer_height_width(self, height, width):
-        self.config.height = height
-        self.config.width = width
-
-    def set_layer_depth(self, depth):
-        self.config.depth = depth
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        if is_print:
-            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, height, width, size))
-
-
-@config_layer('multi_class_cross_entropy_with_selfnorm')
-class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
-    def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs):
-        super(MultiClassCrossEntropySelfNormCostLayer, self).__init__(
-            name, 'multi_class_cross_entropy_with_selfnorm', 0, inputs, **xargs)
-        self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha
-
-
-@config_layer('cross_entropy_over_beam')
-class CrossEntropyOverBeamLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input number.")
-        super(CrossEntropyOverBeamLayer, self).__init__(
-            name, 'cross_entropy_over_beam', 0, inputs, **xargs)
-        input_num = len(inputs) / 3
-        for i in range(input_num):
-            input_layer = self.get_input_layer(i * 3)
-            config_assert(input_layer.size == 1, (
-                "Inputs for this layer are made up of "
-                "several triples, in which the first one is scores over "
-                "all candidate paths, whose size should be equal to 1."))
-
-
-@config_layer('fc')
-class FCLayer(LayerBase):
-    layer_type = 'fc'
-
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        use_mkldnn_wgt = bool(
-            int(g_command_config_args.get("use_mkldnn_wgt", 0)))
-        if use_mkldnn:
-            self.layer_type = 'mkldnn_fc'
-            config_assert(
-                len(inputs) == 1,
-                "MKLDNNFCLayer support one and only one input!")
-        super(FCLayer, self).__init__(
-            name, self.layer_type, size, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            psize = self.config.size * input_layer.size
-            dims = [input_layer.size, self.config.size]
-            format = self.inputs[input_index].format
-            sparse = format == "csr" or format == "csc"
-            if use_mkldnn:
-                config_assert(not sparse,
-                              "MKLDNNFCLayer do not support sparse format yet")
-                if use_mkldnn_wgt:
-                    dims = [self.config.size, input_layer.size]
-            if sparse:
-                psize = self.inputs[input_index].nnz
-            else:
-                sparse = None
-
-            self.create_input_parameter(input_index, psize, dims, sparse,
-                                        format)
-        self.create_bias_parameter(bias, self.config.size)
-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-
-
-@config_layer('mkldnn_fc')
-class MKLDNNFcLayer(FCLayer):
-    layer_type = 'mkldnn_fc'
-
-
-@config_layer('selective_fc')
-class SelectiveFCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 selective_fc_pass_generation=False,
-                 has_selected_colums=True,
-                 selective_fc_full_mul_ratio=0.02,
-                 selective_fc_parallel_plain_mul_thread_num=None,
-                 **xargs):
-        super(SelectiveFCLayer, self).__init__(
-            name, 'selective_fc', size, inputs=inputs, **xargs)
-        # user MUST know if selctive fc is used in training,
-        # parameter matrices saved by this layer are automatically transposed,
-        # BUT bias is not.
-
-        # if selective_fc is used only in testing mode, and parameters for
-        # this layer are trained by fully connected layers,
-        # then TranposedFullMatrixProjectin MUST be used in training
-        # to avoid manual transpose in testing.
-
-        self.config.selective_fc_pass_generation = selective_fc_pass_generation
-        self.config.has_selected_colums = has_selected_colums
-        self.config.selective_fc_full_mul_ratio = selective_fc_full_mul_ratio
-        if selective_fc_parallel_plain_mul_thread_num is not None:
-            self.config.selective_fc_parallel_plain_mul_thread_num = selective_fc_parallel_plain_mul_thread_num
-
-        input_num = len(self.inputs)
-        if has_selected_colums:
-            config_assert(input_num >= 2,
-                          ("if indices of selected columns are not specified, "
-                           "selective_fc Layer has at least two inputs"))
-            input_num -= 1
-
-        for input_index in xrange(input_num):
-            input_layer = self.get_input_layer(input_index)
-            psize = self.config.size * input_layer.size
-            dims = [input_layer.size, self.config.size]
-            dims = dims[::-1]  # transpose the parameter
-            format = self.inputs[input_index].format
-            sparse = format == "csr" or format == "csc"
-            if sparse:
-                psize = self.inputs[input_index].nnz
-
-            self.create_input_parameter(input_index, psize, dims, sparse,
-                                        format)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('print')
-class PrintLayer(LayerBase):
-    def __init__(self, name, inputs, format=None):
-        super(PrintLayer, self).__init__(name, 'print', 0, inputs)
-        if format is None:
-            format = "\n".join([
-                "layer=" + input.input_layer_name + " %s"
-                for input in self.inputs
-            ])
-        self.config.user_arg = format
-
-
-@config_layer('priorbox')
-class PriorBoxLayer(LayerBase):
-    def __init__(self, name, inputs, size, min_size, max_size, aspect_ratio,
-                 variance):
-        super(PriorBoxLayer, self).__init__(name, 'priorbox', 0, inputs)
-        config_assert(len(inputs) == 2, 'PriorBoxLayer must have 2 inputs')
-        input_layer = self.get_input_layer(1)
-        config_assert(
-            input_layer.type == 'data',
-            'Expecting the second input layer of an priorbox layer to be '
-            'a data layer')
-        config_assert(input_layer.width > 0, 'The data layer must set width')
-        config_assert(input_layer.height > 0, 'The data layer must set height')
-        config_assert(len(variance) == 4, 'The variance must have 4 inputs')
-        self.config.inputs[0].priorbox_conf.min_size.extend(min_size)
-        self.config.inputs[0].priorbox_conf.max_size.extend(max_size)
-        self.config.inputs[0].priorbox_conf.aspect_ratio.extend(aspect_ratio)
-        self.config.inputs[0].priorbox_conf.variance.extend(variance)
-        self.config.size = size
-
-
-@config_layer('multibox_loss')
-class MultiBoxLossLayer(LayerBase):
-    def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
-                 neg_pos_ratio, neg_overlap, background_id, **xargs):
-        super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0,
-                                                inputs)
-        config_assert(
-            len(inputs) == (input_num * 2 + 2),
-            'MultiBoxLossLayer does not have enough inputs')
-        config_assert(num_classes > background_id,
-                      'Classes number must greater than background ID')
-        self.config.inputs[0].multibox_loss_conf.num_classes = num_classes
-        self.config.inputs[
-            0].multibox_loss_conf.overlap_threshold = overlap_threshold
-        self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio
-        self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap
-        self.config.inputs[0].multibox_loss_conf.background_id = background_id
-        self.config.inputs[0].multibox_loss_conf.input_num = input_num
-        self.config.size = 1
-
-
-@config_layer('detection_output')
-class DetectionOutputLayer(LayerBase):
-    def __init__(self, name, inputs, size, input_num, num_classes,
-                 nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
-                 background_id, **xargs):
-        super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0,
-                                                   inputs)
-        config_assert(
-            len(inputs) == (input_num * 2 + 1),
-            'DetectionOutputLayer does not have enough inputs')
-        config_assert(num_classes > background_id,
-                      'Classes number must greater than background ID')
-        self.config.inputs[0].detection_output_conf.num_classes = num_classes
-        self.config.inputs[
-            0].detection_output_conf.nms_threshold = nms_threshold
-        self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
-        self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
-        self.config.inputs[
-            0].detection_output_conf.confidence_threshold = confidence_threshold
-        self.config.inputs[
-            0].detection_output_conf.background_id = background_id
-        self.config.inputs[0].detection_output_conf.input_num = input_num
-        self.config.size = size
-
-
-@config_layer('roi_pool')
-class ROIPoolLayer(LayerBase):
-    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
-                 num_channels, **xargs):
-        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
-        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
-        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
-        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
-        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
-        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
-
-
-@config_layer('data')
-class DataLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 depth=None,
-                 height=None,
-                 width=None,
-                 device=None):
-        super(DataLayer, self).__init__(
-            name, 'data', size, inputs=[], device=device)
-        if height and width:
-            self.set_layer_height_width(height, width)
-        if depth:
-            self.set_layer_depth(depth)
-
-
-'''
-DataNormLayer: A layer for data normalization
-Input: One and only one input layer is accepted. The input layer must
-       be DataLayer with dense data type
-Output: The normalization of the input data
-
-Reference:
-    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
-
-Example:
-    Layer(
-        name = "norm_input_layer",
-        type = "data_norm",
-        inputs = [Input("input_layer",
-                        parameter_name = "_slot0.stats")],
-        data_norm_strategy = "z-score",
-    )
-
-Note:
-  (1) The parameter has been calculated in the preprocessing stage,
-      and should be initialized by --init_model_path when training.
-  (2) Three data normalization methoeds are considered
-          z-score: y = (x-mean)/std
-          min-max: y = (x-min)/(max-min)
-          decimal-scaling: y = x/10^j, where j is the smallest integer such that max(|y|)<1
-'''
-
-
-@config_layer('data_norm')
-class DataNormLayer(LayerBase):
-    def __init__(self, name, inputs, data_norm_strategy="z-score", device=None):
-        super(DataNormLayer, self).__init__(
-            name, 'data_norm', 0, inputs=inputs, device=device)
-        self.config.data_norm_strategy = data_norm_strategy
-        config_assert(len(inputs) == 1, 'DataNormLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        para_size = 5 * input_layer.size
-        para_dims = [5, input_layer.size]
-        self.inputs[0].is_static = True
-        self.create_input_parameter(0, para_size, para_dims)
-
-
-@config_layer('prelu')
-class ParameterReluLayer(LayerBase):
-    layer_type = 'prelu'
-
-    def __init__(self, name, inputs, partial_sum=1, **args):
-        super(ParameterReluLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **args)
-
-        input_layer = self.get_input_layer(0)
-        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
-        config_assert(input_layer.size % partial_sum == 0,
-                      "a wrong setting for partial_sum")
-
-        dims = [1, input_layer.size / partial_sum]
-        self.set_layer_size(input_layer.size)
-        self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
-
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                        self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
-
-
-@config_layer('conv')
-class ConvLayerBase(LayerBase):
-    layer_type = 'conv'
-
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=False,
-                 **xargs):
-        super(ConvLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
-        use_gpu = int(g_command_config_args.get("use_gpu", 0))
-        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-
-        # Automatically select cudnn_type for GPU, exconv for CPU
-        # and mkldnn_conv for MKLDNN
-        # if set type=conv, but still reserve the way user specify
-        # exconv, mkldnn_conv or cudnn_conv manually.
-        if self.layer_type == "cudnn_conv":
-            config_assert(use_gpu, "cudnn_conv only support GPU")
-
-        if self.layer_type == "mkldnn_conv":
-            config_assert(use_mkldnn, "mkldnn_conv only support MKLDNN")
-
-        if (use_gpu == 1 and self.layer_type != "exconv" and
-                self.layer_type != "mkldnn_conv" and
-            (parallel_nn == 0 or self.config.device > -1)):
-            self.layer_type = "cudnn_conv"
-        else:
-            self.layer_type = "mkldnn_conv" if use_mkldnn else "exconv"
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            parse_conv(self.inputs[input_index].conv, input_layer.name,
-                       conv_conf, num_filters)
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x,
-                               self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return self.config.num_filters * conv_conf.filter_channels \
-               * (conv_conf.filter_size * conv_conf.filter_size_y)
-
-
-@config_layer('exconv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'exconv'
-
-
-@config_layer('mkldnn_conv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'mkldnn_conv'
-
-
-@config_layer('cudnn_conv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'cudnn_conv'
-
-
-@config_layer('convt')
-class ConvTransLayerBase(LayerBase):
-    layer_type = 'convt'
-
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=False,
-                 **xargs):
-        super(ConvTransLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        use_gpu = int(g_command_config_args.get("use_gpu", 0))
-        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-
-        # Automatically select cudnn_type for GPU and exconvt for CPU
-        # if set type=exconvt, but still reserve the way user specify
-        # exconvt or cudnn_convt manually.
-        if self.layer_type == "cudnn_convt":
-            config_assert(use_gpu, "cudnn_convt only support GPU")
-
-        if (use_gpu == 1 and self.layer_type != "exconvt" and
-            (parallel_nn == 0 or self.config.device > -1)):
-            self.layer_type = "cudnn_convt"
-        else:
-            self.layer_type = "exconvt"
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            parse_conv(
-                self.inputs[input_index].conv,
-                input_layer.name,
-                self.config.inputs[input_index].conv_conf,
-                num_filters,
-                trans=True)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size,
-                               self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return conv_conf.channels * conv_conf.filter_channels \
-                    * (conv_conf.filter_size * conv_conf.filter_size_y)
-
-
-@config_layer('exconvt')
-class ConvTransLayer(ConvTransLayerBase):
-    layer_type = 'exconvt'
-
-
-@config_layer('cudnn_convt')
-class ConvTransLayer(ConvTransLayerBase):
-    layer_type = 'cudnn_convt'
-
-
-@config_layer('conv_3d')
-class Conv3DLayerBase(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=True,
-                 **xargs):
-        super(Conv3DLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        trans = False
-        if self.config.type == "deconv3d":
-            trans = True
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            parse_conv3d(
-                self.inputs[input_index].conv,
-                input_layer.name,
-                conv_conf,
-                num_filters,
-                trans=trans
-            )  # for z-axis pad:0, strid:1, filter_size:1, img_size:1
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            if trans:
-                self.set_cnn_layer(name, conv_conf.img_size_z,
-                                   conv_conf.img_size_y, conv_conf.img_size,
-                                   self.config.num_filters)
-            else:
-                self.set_cnn_layer(name, conv_conf.output_z, conv_conf.output_y,
-                                   conv_conf.output_x, self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return self.config.num_filters * conv_conf.filter_channels \
-               * (conv_conf.filter_size * conv_conf.filter_size_y \
-                  * conv_conf.filter_size_z)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-
-@config_layer('conv3d')
-class Conv3DLayer(Conv3DLayerBase):
-    layer_type = 'conv3d'
-
-
-@config_layer('deconv3d')
-class Conv3DLayer(Conv3DLayerBase):
-    layer_type = 'deconv3d'
-
-
-@config_layer('norm')
-class NormLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        use_mkldnn = True if use_mkldnn and self.inputs[
-            0].norm.norm_type == 'cmrnorm-projection' else False
-        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            norm_conf = self.config.inputs[input_index].norm_conf
-            parse_norm(self.inputs[input_index].norm, input_layer.name,
-                       norm_conf)
-            norm_conf.scale = self.inputs[
-                input_index].norm.scale if use_mkldnn else norm_conf.scale
-            self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
-                               norm_conf.channels, False)
-            if norm_conf.norm_type == "cross-channel-norm":
-                self.create_input_parameter(0, norm_conf.channels,
-                                            [norm_conf.channels, 1])
-
-
-@config_layer('pool')
-class PoolLayer(LayerBase):
-    layer_type = 'pool'
-
-    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
-                 **xargs):
-        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
-        if self.layer_type == "mkldnn_pool":
-            config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
-        self.layer_type = 'mkldnn_pool' if use_mkldnn else 'pool'
-        super(PoolLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            pool_conf = self.config.inputs[input_index].pool_conf
-            parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode, exclude_mode)
-            self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
-                               pool_conf.channels)
-
-
-@config_layer('mkldnn_pool')
-class MKLDNNPoolLayer(PoolLayer):
-    layer_type = 'mkldnn_pool'
-
-
-@config_layer('pool3d')
-class Pool3DLayer(LayerBase):
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
-        super(Pool3DLayer, self).__init__(
-            name, 'pool3d', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            pool_conf = self.config.inputs[input_index].pool_conf
-            parse_pool3d(self.inputs[input_index].pool, input_layer.name,
-                         pool_conf, ceil_mode)
-            self.set_cnn_layer(name, pool_conf.output_z, pool_conf.output_y,
-                               pool_conf.output_x, pool_conf.channels)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-
-@config_layer('spp')
-class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            spp_conf = self.config.inputs[input_index].spp_conf
-            parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf)
-            output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
-            self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
-
-
-@config_layer('upsample')
-class UpsampleLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(UpsampleLayer, self).__init__(
-            name, 'upsample', 0, inputs=inputs, **xargs)
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].upsample_conf.image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-
-        upsample = self.inputs[0].upsample
-        output_x = 0
-        output_y = 0
-        output_size = 0
-
-        if upsample.scale:
-            self.config.inputs[0].upsample_conf.scale = upsample.scale
-            self.config.inputs[0].upsample_conf.scale_y = upsample.scale_y
-            output_x = input_layer.width * upsample.scale
-            output_y = input_layer.height * upsample.scale_y
-        self.config.inputs[0].upsample_conf.pad_out_x = upsample.pad_out_x
-        self.config.inputs[0].upsample_conf.pad_out_y = upsample.pad_out_y
-        if upsample.upsample_size:
-            self.config.inputs[
-                0].upsample_conf.upsample_size = upsample.upsample_size
-            self.config.inputs[
-                0].upsample_conf.upsample_size_y = upsample.upsample_size_y
-            output_x = upsample.upsample_size
-            output_y = upsample.upsample_size_y
-
-        output_size = image_conf.channels * output_x * output_y
-
-        self.set_layer_height_width(output_y, output_x)
-        self.set_layer_depth(input_layer.depth)
-        self.set_layer_size(output_size)
-
-
-@config_layer('pad')
-class PadLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
-        pad = self.inputs[0].pad
-        self.config.inputs[0].pad_conf.pad_c.extend(pad.pad_c)
-        self.config.inputs[0].pad_conf.pad_h.extend(pad.pad_h)
-        self.config.inputs[0].pad_conf.pad_w.extend(pad.pad_w)
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].pad_conf.image_conf
-        parse_image(pad, input_layer.name, image_conf)
-        out_ch = pad.channels + pad.pad_c[0] + pad.pad_c[1]
-        out_h = image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
-        out_w = image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
-        self.set_cnn_layer(name, out_h, out_w, out_ch)
-        self.config.size = out_ch * out_h * out_w
-
-
-@config_layer('crop')
-class CropLayer(LayerBase):
-    def __init__(self, name, inputs, axis, offset, shape, **xargs):
-        super(CropLayer, self).__init__(name, 'crop', 0, inputs=inputs, **xargs)
-        self.config.axis = axis
-        self.config.offset.extend(offset)
-        self.config.shape.extend(shape)
-
-        # get channel, width and height from input_0 layer
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-        # only support for 4-dims inputs and NCHW order
-        if (len(self.config.inputs) == 2):
-            self.set_layer_height_width(
-                self.get_input_layer(1).height, self.get_input_layer(1).width)
-            self.set_layer_size(self.get_input_layer(1).size)
-        else:
-            self.set_layer_height_width(shape[-2], shape[-1])
-            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
-
-
-@config_layer('batch_norm')
-class BatchNormLayer(LayerBase):
-    layer_type = 'batch_norm'
-
-    def __init__(self,
-                 name,
-                 inputs,
-                 bias=True,
-                 img3D=False,
-                 use_global_stats=True,
-                 epsilon=1e-5,
-                 moving_average_fraction=0.9,
-                 batch_norm_type=None,
-                 mean_var_names=None,
-                 **xargs):
-        if inputs is None:
-            inputs = []
-        elif not isinstance(inputs, list):
-            inputs = [inputs]
-        config_assert(
-            len(inputs) == 1, "BatchNormLayer must have one and only one input")
-        # Create Input for moving mean and std,
-        # in batch normalization layer.
-        # These paras no need to update, so set is_static is true.
-        # If not use is_static, even set learning_rate = 0, decay_rate = 0,
-        # these paras will change if set average_window in configure.
-        use_gpu = bool(int(g_command_config_args.get("use_gpu", 0)))
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        is_shared = True if not use_gpu else False
-        for i in xrange(2):
-            inputs.append(
-                Input(
-                    inputs[0].input_layer_name,
-                    initial_std=0.0,
-                    initial_mean=0.0,
-                    is_static=True,
-                    is_shared=is_shared,
-                    make_layer_name_in_submodel=False, ))
-
-        parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0)))
-        cudnn_version = int(g_command_config_args.get("cudnn_version", 0))
-        # Automatically select cudnn_batch_norm for GPU, batch_norm for CPU
-        # and mkldnn_batch_norm for MKLDNN. Also based on cudnn version.
-        if batch_norm_type == "mkldnn_batch_norm":
-            config_assert(use_mkldnn, "mkldnn_batch_norm only support MKLDNN")
-        use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
-                not use_mkldnn and batch_norm_type != "mkldnn_batch_norm" and \
-                ((not parallel_nn) or self.config.device > -1)
-        if use_cudnn:
-            self.layer_type = "cudnn_batch_norm"
-        else:
-            self.layer_type = "mkldnn_batch_norm" if use_mkldnn else "batch_norm"
-        super(BatchNormLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if use_global_stats is not None:
-            self.config.use_global_stats = use_global_stats
-        if moving_average_fraction is not None:
-            self.config.moving_average_fraction = moving_average_fraction
-        if epsilon is not None:
-            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
-            self.config.epsilon = epsilon
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].image_conf
-        if img3D:
-            parse_image3d(self.inputs[0].image, input_layer.name, image_conf)
-            # Only pass the width and height of input to batch_norm layer
-            # when either of it is non-zero.
-            if input_layer.width != 0 or input_layer.height != 0:
-                self.set_cnn_layer(
-                    input_layer_name=name,
-                    depth=image_conf.img_size_z,
-                    height=image_conf.img_size_y,
-                    width=image_conf.img_size,
-                    channels=image_conf.channels,
-                    is_print=True)
-            else:
-                self.set_layer_size(input_layer.size)
-        else:
-            parse_image(self.inputs[0].image, input_layer.name, image_conf)
-            # Only pass the width and height of input to batch_norm layer
-            # when either of it is non-zero.
-            if input_layer.width != 0 or input_layer.height != 0:
-                self.set_cnn_layer(
-                    input_layer_name=name,
-                    height=image_conf.img_size_y,
-                    width=image_conf.img_size,
-                    channels=image_conf.channels,
-                    is_print=True)
-            else:
-                self.set_layer_size(input_layer.size)
-
-        psize = self.calc_parameter_size(image_conf)
-        dims = [1, psize]
-        if mean_var_names is not None:
-            assert len(mean_var_names) == 2
-            self.inputs[1].parameter_name = mean_var_names[0]
-            self.inputs[2].parameter_name = mean_var_names[1]
-
-        self.create_input_parameter(0, psize)
-        self.create_input_parameter(1, psize, dims)
-        self.create_input_parameter(2, psize, dims)
-
-        self.create_bias_parameter(bias, psize)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth=None,
-                      height=None,
-                      width=None,
-                      channels=None,
-                      is_print=True):
-        depthIsNone = False
-        if depth is None:
-            depth = 1
-            depthIsNone = True
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print and depthIsNone:
-            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, height, width, size))
-        elif is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-    def calc_parameter_size(self, image_conf):
-        return image_conf.channels
-
-
-@config_layer('trans')
-class TransLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(TransLayer, self).__init__(
-            name, 'trans', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'TransLayer must have one and only one input')
-        self.set_layer_size(self.get_input_layer(0).size)
-
-
-@config_layer('resize')
-class ResizeLayer(LayerBase):
-    def __init__(self, name, size, inputs, **xargs):
-        super(ResizeLayer, self).__init__(
-            name, 'resize', size=size, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ResizeLayer must have one and only one input')
-
-
-@config_layer('rotate')
-class RotateLayer(LayerBase):
-    def __init__(self, name, inputs, height, width, device=None):
-        super(RotateLayer, self).__init__(
-            name, 'rotate', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1,
-            'RotateLayer must have one and only one input')
-        self.set_layer_height_width(height, width)
-        self.set_layer_size(self.get_input_layer(0).size)
-
-
-@config_layer('blockexpand')
-class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            parse_block_expand(
-                self.inputs[input_index].block_expand, input_layer.name,
-                self.config.inputs[input_index].block_expand_conf)
-            block_expand_conf = self.config.inputs[
-                input_index].block_expand_conf
-            self.set_layer_size(block_expand_conf.block_x *
-                                block_expand_conf.block_y *
-                                block_expand_conf.channels)
-
-
-@config_layer('maxout')
-class MaxOutLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(MaxOutLayer, self).__init__(
-            name, 'maxout', 0, inputs=inputs, **xargs)
-        input_layer = self.get_input_layer(0)
-        maxout_conf = self.config.inputs[0].maxout_conf
-        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
-        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
-                           maxout_conf.image_conf.img_size, out_channels)
-
-
-@config_layer('row_conv')
-class RowConvLayer(LayerBase):
-    def __init__(self, name, inputs, context_length, **xargs):
-        super(RowConvLayer, self).__init__(
-            name, 'row_conv', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'row convolution layer must have one and only one input.')
-        input_layer = self.get_input_layer(0)
-        row_conv_conf = self.config.inputs[0].row_conv_conf
-        row_conv_conf.context_length = context_length
-        self.set_layer_size(input_layer.size)
-        psize = context_length * input_layer.size
-        dims = [context_length, input_layer.size]
-        self.create_input_parameter(0, psize, dims)
-
-
-@config_layer('clip')
-class ClipLayer(LayerBase):
-    def __init__(self, name, inputs, min, max, **xargs):
-        super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ClipLayer must have one and only one input.')
-        config_assert(min < max, 'min must be less than max.')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        self.config.inputs[0].clip_conf.min = min
-        self.config.inputs[0].clip_conf.max = max
-
-
-@config_layer('scale_shift')
-class ScaleShiftLayer(LayerBase):
-    def __init__(self, name, inputs, bias=True, **xargs):
-        super(ScaleShiftLayer, self).__init__(
-            name, 'scale_shift', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ScaleShiftLayer must have one and only one input.')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        self.create_input_parameter(0, 1, [1, 1])
-        self.create_bias_parameter(bias, 1)
-
-
-# key: cost type
-# value: cost class
-g_cost_map = {}
-
-
-# define a cost layer without any parameters
-def define_cost(class_name, cost_type):
-    def init(cls, name, inputs, device=None, coeff=1.):
-        super(type(cls), cls).__init__(
-            name, cost_type, 1, inputs, device=device, coeff=coeff)
-
-    cls = type(class_name, (LayerBase, ), dict(__init__=init))
-    global g_cost_map
-    g_cost_map[cost_type] = cls
-
-
-define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
-define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam')
-define_cost('RankingCost', 'rank-cost')
-define_cost('AucValidation', 'auc-validation')
-define_cost('PnpairValidation', 'pnpair-validation')
-define_cost('SumOfSquaresCostLayer', 'square_error')
-define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
-define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
-define_cost('HuberTwoClassification', 'huber_classification')
-define_cost('SumCost', 'sum_cost')
-define_cost('SmoothL1Cost', 'smooth_l1')
-
-
-@config_layer('hsigmoid')
-class HierarchicalSigmoidLayer(LayerBase):
-    def __init__(self, name, num_classes, inputs, device=None, bias=True):
-        super(HierarchicalSigmoidLayer, self).__init__(
-            name, 'hsigmoid', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) >= 2,
-            'HierarchicalSigmoidLayer must have at least 2 inputs')
-        self.config.num_classes = num_classes
-        for input_index in xrange(len(self.inputs) - 1):
-            input_layer = self.get_input_layer(input_index)
-            psize = (num_classes - 1) * input_layer.size
-            dims = [num_classes - 1, input_layer.size]
-            self.create_input_parameter(input_index, psize, dims)
-        self.create_bias_parameter(bias, num_classes - 1)
-
-
-'''
-lambdaCost for lambdaRank LTR approach
-
-Usage:
-  Example: Layer(name = "cost", type = "lambda_cost", NDCG_num = 8,
-             max_sort_size = -1, inputs = ["output", "score"])
-
-  Input data: Samples of the same query should be loaded as a sequence,
-          by PyDataProvider etc.. User should provide
-          scores for each sample. The score slot should be the 2nd
-          input of lambdaRank layer.
-
-  NDCG_num = the size of NDCG, e.g., 5 for NDCG@5.
-    Note: NDCG_num must be less than or equal to the minimum
-          size of lists.
-
-  max_sort_size = the size of partial sorting in calculating gradient.
-    Note: If max_sort_size = -1, then for each list, the algorithm will
-          sort the entire list to get gradient.
-          In other cases, max_sort_size must be greater than or equal
-          to NDCG_num.
-          max_sort_size can be greater than the size of a list, in which
-          case the algorithm will sort the entire list to get gradient.
-'''
-
-
-@config_layer('lambda_cost')
-class LambdaCost(LayerBase):
-    def __init__(self, name, inputs, NDCG_num=5, max_sort_size=-1, device=None):
-        super(LambdaCost, self).__init__(
-            name, 'lambda_cost', 1, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 2, 'lambdaCost must have 2 inputs')
-        self.config.NDCG_num = NDCG_num
-        if max_sort_size != -1:
-            config_assert(
-                NDCG_num <= max_sort_size,
-                'NDCG_num must be less than or equal to max_sort_size')
-        self.config.max_sort_size = max_sort_size
-
-
-@config_layer('huber_regression')
-class HuberRegressionLoss(LayerBase):
-    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
-        super(HuberRegressionLoss, self).__init__(
-            name, 'huber_regression', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
-        self.config.delta = delta
-        self.config.coeff = coeff
-
-
-@config_layer('nce')
-class NCELayer(LayerBase):
-    def __init__(self,
-                 name,
-                 num_classes,
-                 inputs,
-                 num_neg_samples=10,
-                 neg_sampling_dist=None,
-                 bias=True,
-                 **xargs):
-        super(NCELayer, self).__init__(name, 'nce', 1, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) >= 2, 'NCELayer must have at least 2 inputs')
-        self.config.num_classes = num_classes
-        if neg_sampling_dist is not None:
-            config_assert(
-                len(neg_sampling_dist) == num_classes,
-                'len(neg_sampling_dist)(%s) is not same as num_classes (%s)' %
-                (len(neg_sampling_dist), num_classes))
-            s = sum(neg_sampling_dist)
-            config_assert(
-                abs(s - 1) < 1e-5,
-                'The sum of neg_sampling_dist (%s) is not 1' % s)
-
-            self.config.neg_sampling_dist.extend(neg_sampling_dist)
-
-        self.config.num_neg_samples = num_neg_samples
-        num_real_inputs = len(self.inputs) - 1
-        input_layer = self.get_input_layer(num_real_inputs)
-        config_assert(input_layer.type == 'data',
-                      'Expecting the last input layer of an nce layer to be '
-                      'a data layer')
-
-        if (num_real_inputs > 1 and input_layer.size == 1 and
-                self.get_input_layer(num_real_inputs - 1).type == 'data'):
-            # This input layer is assumed to be a sample weight layer
-            num_real_inputs -= 1
-
-        for input_index in xrange(num_real_inputs):
-            input_layer = self.get_input_layer(input_index)
-            psize = num_classes * input_layer.size
-            dims = [num_classes, input_layer.size]
-            self.create_input_parameter(input_index, psize, dims)
-        self.create_bias_parameter(bias, num_classes)
-
-
-@config_layer('addto')
-class AddToLayer(LayerBase):
-    layer_type = 'addto'
-
-    def __init__(self, name, inputs, bias=True, **xargs):
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        if self.layer_type == "mkldnn_addto":
-            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
-        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
-        super(AddToLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
-
-        layer_size = self.get_input_layer(0).size
-        # To reserve heght, width, depth.
-        layer_with_hwc = self.get_input_layer(0)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            assert layer_size == input_layer.size
-            if input_layer.height and input_layer.height and input_layer.height:
-                layer_with_hwc = input_layer
-
-        self.set_layer_size(layer_with_hwc.size)
-        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
-        self.set_layer_depth(layer_with_hwc.depth)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('mkldnn_addto')
-class MKLDNNAddtoLayer(AddToLayer):
-    layer_type = 'mkldnn_addto'
-
-
-@config_layer('agent')
-class AgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(AgentLayer, self).__init__(
-            name, 'agent', size, inputs=[], device=device)
-
-
-@config_layer('gather_agent')
-class GatherAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(GatherAgentLayer, self).__init__(
-            name, 'gather_agent', size, inputs=[], device=device)
-
-
-@config_layer('scatter_agent')
-class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, width=None, height=None, device=None):
-        super(ScatterAgentLayer, self).__init__(
-            name, 'scatter_agent', size, inputs=[], device=device)
-        if height and width:
-            self.set_layer_height_width(height, width)
-
-
-@config_layer('multiplex')
-class MultiplexLayer(LayerBase):
-    def __init__(self, name, inputs, size, device=None):
-        super(MultiplexLayer, self).__init__(
-            name, 'multiplex', size, inputs=inputs, device=device)
-        config_assert(
-            len(inputs) > 2, 'MultiplexLayer should have more than 2 inputs.')
-        for i in range(1, len(inputs)):
-            config_assert(
-                self.get_input_layer(i).size == size,
-                "All the input layers except the first one should"
-                "have the same size as the MultiplexLayer.")
-
-
-@config_func
-def Link(name, has_subseq=False):
-    """
-    Still keeping has_subseq for backward compatibility
-    """
-    link_config = LinkConfig()
-    link_config.link_name = name
-    return link_config
-
-
-# memory for recurrent layer group.
-# *name* and *size* are actual layer's name and size.
-# If *name* is None, need to provide *memory_name* and need to use
-# SetMemoryInput() later to specify the layer which this memory remembers.
-#
-# return the name of the memory,
-# use this name if you assign the memory as other layer's input
-#
-# boot frame of memory is zeroed by default,
-# or initialize by boot layer output if *boot_layer* set,
-# or initialize by trainable bias if *boot_bias* set,
-# or initialize by a constant id if *boot_with_const_id* set
-#
-# Memory can be a sequence if *is_sequence* set, this type of memory
-# can only be initailized by a *boot_layer* which is a sequence.
-#
-@config_func
-def Memory(name,
-           size,
-           is_sequence=False,
-           boot_layer=None,
-           boot_bias=False,
-           boot_bias_active_type="",
-           boot_with_const_id=None,
-           memory_name=None):
-    if not memory_name:
-        config_assert(name is not None, "name needs cannot be None")
-        memory_name = name + "+delay1"
-    agent_name = memory_name
-    agent_layer = AgentLayer(agent_name, size)
-    config_assert(g_current_submodel.is_recurrent_layer_group,
-                  'Memory should be used in recurrent layer group only')
-    memory = g_current_submodel.memories.add()
-    if name is not None:
-        memory.layer_name = MakeLayerNameInSubmodel(name)
-    memory.link_name = MakeLayerNameInSubmodel(agent_name)
-    options = sum((boot_layer is not None, bool(boot_bias),
-                   boot_with_const_id is not None))
-    config_assert(
-        options <= 1,
-        'take one option at most from boot_layer, boot_bias, or boot_with_const_id'
-    )
-    if boot_layer is not None:
-        boot_layer = MakeLayerNameInParentSubmodel(boot_layer)
-        config_assert(boot_layer in g_layer_map,
-                      'boot_layer "%s" does not correspond to a layer name' %
-                      boot_layer)
-        memory.boot_layer_name = boot_layer
-    elif boot_bias:
-        memory.boot_bias_parameter_name = agent_layer.create_bias_parameter(
-            boot_bias, size, for_self=False)
-        memory.boot_bias_active_type = boot_bias_active_type
-    elif boot_with_const_id is not None:
-        memory.boot_with_const_id = boot_with_const_id
-    return agent_name
-
-
-@config_func
-def SetMemoryInput(memory_name, layer_name):
-    memory_name = MakeLayerNameInSubmodel(memory_name)
-    layer_name = MakeLayerNameInSubmodel(layer_name)
-    for mem in g_current_submodel.memories:
-        if mem.link_name == memory_name:
-            mem.layer_name = layer_name
-            return
-    logger.fatal("Nonexistent memory name: " + memory_name)
-
-
-# Generator for recurrent layer group, to use it:
-#  1. define a id layer as output of layer group
-#  2. define a memory of this id layer, and assign a boot id(begin of sequence)
-#  3. define a eos check layer and fill its name in generator's *eos_layer_name*
-# Sequence generation will stop when eos check return 1 or *max_num_frames* reached.
-# If *beam_size* is greater than one, generator will use beam search.
-#   in beam search, if *num_results_per_sample* set, one sample sequence can output
-#   multiple results each with a probility.
-@config_func
-def Generator(
-        max_num_frames,
-        eos_layer_name="eos_check",
-        num_results_per_sample=1,
-        beam_size=1,
-        log_prob=None, ):
-    generator_config = GeneratorConfig()
-    generator_config.max_num_frames = max_num_frames
-    generator_config.eos_layer_name = eos_layer_name
-    generator_config.num_results_per_sample = num_results_per_sample
-    generator_config.beam_size = beam_size
-    if log_prob is not None:
-        generator_config.log_prob = log_prob
-    return generator_config
-
-
-@config_layer('expand')
-class ExpandLayer(LayerBase):
-    def __init__(self, name, inputs, trans_type='non-seq', bias=False, **xargs):
-        super(ExpandLayer, self).__init__(
-            name, 'expand', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs')
-        self.config.trans_type = trans_type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('featmap_expand')
-class FeatMapExpandLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 num_filters=None,
-                 as_row_vector=True,
-                 bias=False,
-                 **xargs):
-        super(FeatMapExpandLayer, self).__init__(
-            name, 'featmap_expand', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'ExpandLayer takes 1 and only 1 inputs')
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-        else:
-            logger.fatal("FeatMapExpandLayer must specify num_filters.")
-        if not as_row_vector:
-            self.config.user_arg = "as_col_vec"
-        self.set_layer_size(self.get_input_layer(0).size * num_filters)
-
-
-@config_layer('max')
-class MaxLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 output_max_index=None,
-                 stride=-1,
-                 **xargs):
-        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-        if output_max_index is not None:
-            self.config.output_max_index = output_max_index
-
-
-@config_layer('maxid')
-class MaxIdLayer(LayerBase):
-    def __init__(self, name, inputs, beam_size=None, device=None):
-        super(MaxIdLayer, self).__init__(
-            name, 'maxid', 0, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 1, 'MaxIdLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-
-        if beam_size is None:
-            global g_current_submodel
-            if g_current_submodel.HasField("generator"):
-                self.config.beam_size = g_current_submodel.generator.beam_size
-        else:
-            self.config.beam_size = beam_size
-
-
-@config_layer('eos_id')
-class EosIdLayer(LayerBase):
-    def __init__(self, name, inputs, eos_id, device=None):
-        super(EosIdLayer, self).__init__(
-            name, 'eos_id', 0, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 1, 'EosIdLayer must have 1 input')
-        self.set_layer_size(2)  # boolean output
-        self.config.eos_id = eos_id
-
-
-@config_layer('seqlastins')
-class SequenceLastInstanceLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(SequenceLastInstanceLayer, self).__init__(
-            name, 'seqlastins', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('seqfirstins')
-class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(SequenceFirstInstanceLayer, self).__init__(
-            name,
-            inputs=inputs,
-            trans_type=trans_type,
-            bias=bias,
-            stride=stride,
-            **xargs)
-        self.config.select_first = True
-
-
-@config_layer('seqconcat')
-class SequenceConcatLayer(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        super(SequenceConcatLayer, self).__init__(
-            name, 'seqconcat', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('seqreshape')
-class SequenceReshapeLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=False, **xargs):
-        super(SequenceReshapeLayer, self).__init__(
-            name, 'seqreshape', size, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
-        self.set_layer_size(size)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('subseq')
-class SubSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        super(SubSequenceLayer, self).__init__(
-            name, 'subseq', 0, inputs=inputs, **xargs)
-        config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('seq_slice')
-class SeqSliceLayer(LayerBase):
-    def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
-        if isinstance(inputs, list):
-            assert len(inputs) == 1, ('the first input of sequence slice layer '
-                                      'is a single sequence input.')
-        else:
-            inputs = [inputs]
-
-        if starts is not None:
-            if isinstance(starts, list):
-                assert len(starts) == 1, (
-                    'the start indices for sequence slice layer cannot '
-                    'be a list having more than one element.')
-                starts = starts[0]
-            inputs.append(starts)
-
-        if ends is not None:
-            if isinstance(ends, list):
-                assert len(ends) == 1, (
-                    'the end indices for sequence slice layer cannot '
-                    'be a list having more than one element.')
-                ends = ends[0]
-            inputs.append(ends)
-        assert len(inputs) >= 2, (
-            'the sequence slice layer has at least two inputs.')
-
-        super(SeqSliceLayer, self).__init__(
-            name, 'seq_slice', 0, inputs=inputs, **xargs)
-
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-
-        if len(inputs) == 3:
-            assert (
-                self.get_input_layer(1).size == self.get_input_layer(2).size), (
-                    'If start and end indices are both given to'
-                    'sequence slice layer, they should have the same width.')
-        elif len(inputs) == 2:
-            self.config.select_first = (starts is not None)
-
-
-@config_layer('sub_nested_seq')
-class SubNestedSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
-        if isinstance(inputs, list):
-            assert len(inputs) == 1, ('the first input of sub_nested_seq '
-                                      'layer is a single nested sequence.')
-            inputs = inputs[0]
-        if isinstance(selected_indices, list):
-            assert len(selected_indices) == 1, (
-                'the second input of '
-                'sub_nested_seq layer is a single layer which is a '
-                'set of selected indices.')
-            selected_indices = selected_indices[0]
-
-        super(SubNestedSequenceLayer, self).__init__(
-            name,
-            'sub_nested_seq',
-            0,
-            inputs=[inputs, selected_indices],
-            **xargs)
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-
-
-@config_layer('dot_prod')
-class DotProdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(DotProdLayer, self).__init__(
-            name, 'dot_prod', 0, inputs, device=device)
-        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            "Two inputs should have the same size.")
-        self.set_layer_size(1)
-
-
-@config_layer('out_prod')
-class OuterProdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(OuterProdLayer, self).__init__(
-            name, 'out_prod', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'OuterProdLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer0.size * input_layer1.size)
-
-
-@config_layer('power')
-class PowerLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(PowerLayer, self).__init__(
-            name, 'power', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'PowerLayer must have 2 inputs')
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer1.size)
-        input_layer0 = self.get_input_layer(0)
-        config_assert(1 == input_layer0.size,
-                      'The left input is the exponent and should be of size 1')
-
-
-@config_layer('slope_intercept')
-class SlopeInterceptLayer(LayerBase):
-    def __init__(self, name, inputs, slope=1.0, intercept=0.0, device=None):
-        super(SlopeInterceptLayer, self).__init__(
-            name, 'slope_intercept', 0, inputs=inputs, device=device)
-        self.config.slope = slope
-        self.config.intercept = intercept
-        config_assert(len(inputs) == 1, 'SlopeInterceptLayer must have 1 input')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('scaling')
-class ScalingLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(ScalingLayer, self).__init__(
-            name, 'scaling', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'ScalingLayer must have 2 inputs')
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer1.size)
-        input_layer0 = self.get_input_layer(0)
-        config_assert(1 == input_layer0.size,
-                      'The left input should be of size 1')
-
-
-@config_layer('conv_shift')
-class ConvShiftLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(ConvShiftLayer, self).__init__(
-            name, 'conv_shift', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'ConvShiftLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('convex_comb')
-class ConvexCombinationLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
-        super(ConvexCombinationLayer, self).__init__(
-            name, 'convex_comb', size, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, 'ConvexCombinationLayer must have 2 inputs')
-        config_assert(
-            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for ConvexCombinationLayer')
-        self.set_layer_size(size)
-
-
-@config_layer('interpolation')
-class InterpolationLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(InterpolationLayer, self).__init__(
-            name, 'interpolation', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 3, 'InterpolationLayer must have 3 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        input_layer2 = self.get_input_layer(2)
-        self.set_layer_size(input_layer1.size)
-        config_assert(input_layer0.size == 1, 'weight should be of size 1')
-        config_assert(input_layer1.size == input_layer2.size,
-                      'the two vector inputs should be of the same size')
-
-
-@config_layer('bilinear_interp')
-class BilinearInterpLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(BilinearInterpLayer, self).__init__(
-            name, 'bilinear_interp', 0, inputs=inputs, **xargs)
-        input_layer = self.get_input_layer(0)
-        conf = self.config.inputs[0].bilinear_interp_conf
-        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
-        self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
-                           conf.image_conf.channels)
-
-
-@config_layer('sum_to_one_norm')
-class SumToOneNormLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(SumToOneNormLayer, self).__init__(
-            name, 'sum_to_one_norm', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1, 'SumToOneNormLayer must have 1 input')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('row_l2_norm')
-class RowL2NormLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(RowL2NormLayer, self).__init__(
-            name, 'row_l2_norm', 0, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-
-
-@config_layer('cos')
-class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=1, device=None):
-        super(CosSimLayer, self).__init__(
-            name, 'cos', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2,
-            'The CosSimLayer expects two and only two inputs.')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'The two inputs of CosSimLayer must have the same dimensionality.')
-        self.config.cos_scale = cos_scale
-
-
-@config_layer('cos_vm')
-class CosSimVecMatLayer(LayerBase):
-    def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
-        super(CosSimVecMatLayer, self).__init__(
-            name, 'cos_vm', size, inputs=inputs, device=device)
-        self.config.cos_scale = cos_scale
-        config_assert(
-            len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.')
-        config_assert(
-            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for CosSimVecMatLayer.')
-
-
-@config_layer('l2_distance')
-class L2DistanceLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(L2DistanceLayer, self).__init__(
-            name, 'l2_distance', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, ('The L2DistanceLayer must have '
-                                    'and only have 2 inputs.'))
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            ('Two inputs of the L2DistanceLayer must have '
-             'the same dimensionality.'))
-
-
-@config_layer('sampling_id')
-class SamplingIdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(SamplingIdLayer, self).__init__(
-            name, 'sampling_id', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1, 'SamplingIdLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-
-
-# AverageLayer: "average" for each sample within a sequence.
-# average_stratrgy: set to one of the following:
-# 'average': plain average.
-# 'sum': sum each sample instead of average (which is divide by sample_num).
-# 'squarerootn': sum each sample, but divide by sqrt(sample_num).
-@config_layer('average')
-class AverageLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 average_strategy='average',
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(AverageLayer, self).__init__(
-            name, 'average', 0, inputs=inputs, **xargs)
-        self.config.average_strategy = average_strategy
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('tensor')
-class TensorLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
-        super(TensorLayer, self).__init__(
-            name, 'tensor', size, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs')
-        config_assert(size > 0, 'size must be positive')
-        config_assert(inputs[1].parameter_name == None,
-                      'second parameter should be None.')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        psize = size * input_layer0.size * input_layer1.size
-        dims = [input_layer0.size, input_layer1.size, size]
-        self.create_input_parameter(0, psize, dims)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('mixed')
-class MixedLayer(LayerBase):
-    def __init__(self, name, inputs, size=0, bias=True, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        super(MixedLayer, self).__init__(
-            name, 'mixed', size, inputs=inputs, **xargs)
-        operator_input_index = []
-        for operator in self.operators:
-            operator_conf = operator.operator_conf
-            for i in xrange(1, len(operator.input_layer_names)):
-                input_index = len(self.config.inputs)
-                operator_conf.input_indices.append(input_index)
-                input_config = Input(operator.input_layer_names[i])
-                self.inputs.append(input_config)
-                layer_input = self.config.inputs.add()
-                layer_input.input_layer_name = input_config.input_layer_name
-            for input_index in operator_conf.input_indices:
-                input_layer = self.get_input_layer(input_index)
-                operator_conf.input_sizes.append(input_layer.size)
-                operator_input_index.append(input_index)
-            if self.config.size == 0:
-                size = operator.calc_output_size(operator_conf.input_sizes)
-                if size != 0:
-                    self.set_layer_size(size)
-            else:
-                sz = operator.calc_output_size(operator_conf.input_sizes)
-                if sz != 0:
-                    config_assert(
-                        sz == self.config.size,
-                        "different inputs have different size: %s vs. %s" %
-                        (sz, self.config.size))
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            if input_index not in operator_input_index:
-                config_assert(
-                    isinstance(input, Projection),
-                    "input should be projection or operation")
-            if self.config.size == 0 and isinstance(input, Projection):
-                size = input.calc_output_size(input_layer)
-                if size != 0:
-                    self.set_layer_size(size)
-            elif isinstance(input, Projection):
-                sz = input.calc_output_size(input_layer)
-                if sz != 0:
-                    config_assert(
-                        sz == self.config.size,
-                        "different inputs have different size: %s vs. %s" %
-                        (sz, self.config.size))
-        config_assert(size != 0, "size is not set")
-
-        for input_index in xrange(len(self.inputs)):
-            input = self.inputs[input_index]
-            if isinstance(input, Projection):
-                input_layer = self.get_input_layer(input_index)
-                input.proj_conf.input_size = input_layer.size
-                input.proj_conf.output_size = size
-
-                input_config = self.config.inputs[input_index]
-                input_config.proj_conf.CopyFrom(input.proj_conf)
-                input_config.proj_conf.name = gen_parameter_name(name,
-                                                                 input_index)
-                psize = input.calc_parameter_size(input_layer.size, size)
-                dims = input.calc_parameter_dims(input_layer.size, size)
-                self.create_input_parameter(input_index, psize, dims)
-
-        for operator in self.operators:
-            operator_conf = operator.operator_conf
-            operator_conf.output_size = self.config.size
-            operator.check_dims()
-            record_operator_conf = self.config.operator_confs.add()
-            record_operator_conf.CopyFrom(operator_conf)
-
-        psize = self.config.size
-        if isinstance(self.inputs[0], ConvProjection):
-            self.config.shared_biases = True
-            psize = 0
-            for input in self.inputs:
-                psize += input.calc_bias_size()
-
-        if bias:
-            self.config.bias_size = psize
-            self.create_bias_parameter(bias, psize)
-
-
-# like MixedLayer, but no bias parameter
-@config_func
-def ExpressionLayer(name, inputs, **xargs):
-    MixedLayer(name, inputs, bias=False, **xargs)
-
-
-@config_layer('concat')
-class ConcatenateLayer(LayerBase):
-    layer_type = 'concat'
-
-    def __init__(self, name, inputs, bias=False, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        config_assert(not bias, 'ConcatenateLayer cannot support bias.')
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        if self.layer_type == "mkldnn_concat":
-            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
-        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
-        super(ConcatenateLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        size = 0
-        for input_index in xrange(len(self.inputs)):
-            assert self.get_input_layer(0).height == self.get_input_layer(
-                input_index).height
-            assert self.get_input_layer(0).width == self.get_input_layer(
-                input_index).width
-            assert self.get_input_layer(0).depth == self.get_input_layer(
-                input_index).depth
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            if self.config.size == 0:
-                size += input_layer.size
-
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                    self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
-        self.set_layer_size(size)
-
-
-@config_layer('mkldnn_concat')
-class MKLDNNConcatLayer(ConcatenateLayer):
-    layer_type = 'mkldnn_concat'
-
-
-# like concat layer, but each input layer was processed by a Projection.
-@config_layer('concat2')
-class ConcatenateLayer2(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        super(ConcatenateLayer2, self).__init__(
-            name, 'concat2', 0, inputs=inputs, **xargs)
-
-        if isinstance(self.inputs[0], ConvProjection):
-            for input_index in xrange(len(self.inputs) - 1):
-                input = self.inputs[input_index + 1]
-                config_assert(
-                    isinstance(input, ConvProjection),
-                    "The first input of ConcatenateLayer2 is ConvProjection, "
-                    "the other inputs should also be ConvProjection.")
-
-        size = 0
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            output_size = input.calc_output_size(input_layer)
-            config_assert(output_size != 0, "proj output size is not set")
-            size += output_size
-
-        self.set_layer_size(size)
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            input.proj_conf.input_size = input_layer.size
-            input.proj_conf.output_size = input.calc_output_size(input_layer)
-
-            input_config = self.config.inputs[input_index]
-            input_config.proj_conf.CopyFrom(input.proj_conf)
-            input_config.proj_conf.name = gen_parameter_name(name, input_index)
-            psize = input.calc_parameter_size(input.proj_conf.input_size,
-                                              input.proj_conf.output_size)
-            dims = input.calc_parameter_dims(input.proj_conf.input_size,
-                                             input.proj_conf.output_size)
-            self.create_input_parameter(input_index, psize, dims)
-
-        psize = self.config.size
-        if isinstance(self.inputs[0], ConvProjection):
-            self.config.shared_biases = True
-            psize = 0
-            for input in self.inputs:
-                psize += input.calc_bias_size()
-
-        if bias:
-            self.config.bias_size = psize
-            self.create_bias_parameter(bias, psize)
-
-
-@config_layer('recurrent')
-class RecurrentLayer(LayerBase):
-    layer_type = 'recurrent'
-
-    def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
-        use_mkl_packed = bool(
-            int(g_command_config_args.get("use_mkl_packed", 0)))
-        self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent'
-        super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs,
-                                             **xargs)
-        config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        size = input_layer.size
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        dims = [size, size]
-        self.create_input_parameter(0, size * size, dims)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('lstmemory')
-class LstmLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 reversed=False,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(LstmLayer, self).__init__(name, 'lstmemory', 0, inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'LstmLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        #check input_layer.size is divided by 4
-        config_assert(input_layer.size % 4 == 0, "size % 4 should be 0!")
-        size = input_layer.size / 4
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        self.create_input_parameter(0, size * size * 4, [size, size, 4])
-        #bias includes 3 kinds of peephole, 4 + 3 = 7
-        self.create_bias_parameter(bias, size * 7)
-
-
-@config_layer('lstm_step')
-class LstmStepLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(LstmStepLayer, self).__init__(name, 'lstm_step', size, inputs,
-                                            **xargs)
-        config_assert(len(inputs) == 2, 'LstmStepLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        config_assert(input_layer0.size == 4 * size,
-                      'input_layer0.size != 4 * layer.size')
-        config_assert(input_layer1.size == size,
-                      'input_layer1.size != layer.size')
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        self.create_bias_parameter(bias, size * 3)
-
-
-# get the specific output from the input layer.
-@config_layer('get_output')
-class GetOutputLayer(LayerBase):
-    def __init__(self, name, size, inputs):
-        super(GetOutputLayer, self).__init__(name, 'get_output', size, inputs)
-        config_assert(
-            len(self.inputs) == 1, 'GetOutputLayer must have 1 inputs')
-        inputs = self.inputs[0]
-        config_assert(inputs.input_layer_argument,
-                      'input_layer_argument cannot be empty')
-
-
-@config_layer('mdlstmemory')
-class MDLstmLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 directions=True,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs,
-                                          **xargs)
-        config_assert(len(self.inputs) == 1, 'MDLstmLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        dim_num = len(directions)
-        #check input_layer.size is divided by (3+dim_num)
-        config_assert(input_layer.size % (3 + dim_num) == 0,
-                      "size % (dim_num) should be 0!")
-        size = input_layer.size / (3 + dim_num)
-        self.set_layer_size(size)
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        for i in xrange(len(directions)):
-            self.config.directions.append(int(directions[i]))
-        self.create_input_parameter(0, size * size * (3 + dim_num),
-                                    [size, size, 3 + dim_num])
-        #bias includes 3 kinds of peephole, 3+dim_num+2+dim_num
-        self.create_bias_parameter(bias, size * (5 + 2 * dim_num))
-
-
-@config_layer('gated_recurrent')
-class GatedRecurrentLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 reversed=False,
-                 active_gate_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(GatedRecurrentLayer, self).__init__(name, 'gated_recurrent', 0,
-                                                  inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'GatedRecurrentLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        #check input_layer.size is divided by 3
-        config_assert(input_layer.size % 3 == 0, "size % 3 should be 0!")
-        size = input_layer.size / 3
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        self.config.active_gate_type = active_gate_type
-        self.create_input_parameter(0, size * size * 3, [size, size * 3])
-        self.create_bias_parameter(bias, size * 3)
-
-
-@config_layer('gru_step')
-class GruStepLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_gate_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs,
-                                           **xargs)
-        config_assert(len(self.inputs) == 2, 'GruStepLayer must have 2 input')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        config_assert(input_layer0.size == 3 * size,
-                      'input_layer0.size != 3 * layer.size')
-        config_assert(input_layer1.size == size,
-                      'input_layer1.size != layer.size')
-        self.config.active_gate_type = active_gate_type
-        self.create_input_parameter(0, size * size * 3, [size, size * 3])
-        self.create_bias_parameter(bias, size * 3)
-
-
-'''
- A layer for calculating the cost of sequential conditional random field model.
- Example: CRFLayer(name="crf_cost", size=label_num,
-                   inputs=["output", "label", "weight"])
-          where "weight" is optional, one weight for each sequence
- @param coeff: weight of the layer
-'''
-
-
-@config_layer('crf')
-class CRFLayer(LayerBase):
-    def __init__(self, name, size, inputs, coeff=1.0, device=None):
-        super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
-        config_assert(2 <= len(self.inputs) <= 3,
-                      'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
-        self.config.coeff = coeff
-
-
-'''
- A layer for calculating the decoding sequence of sequential conditional
- random field model.
- The decoding sequence is stored in output_.ids
- If a second input is provided, it is treated as the ground-truth label, and
- this layer will also calculate error, output_.value[i] is 1 for incorrect
- decoding or 0 for correct decoding
-'''
-
-
-@config_layer('crf_decoding')
-class CRFDecodingLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
-        super(CRFDecodingLayer, self).__init__(
-            name, 'crf_decoding', size, inputs, device=device)
-        config_assert(
-            len(self.inputs) <= 2,
-            'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
-
-
-@config_layer('ctc')
-class CTCLayer(LayerBase):
-    def __init__(self, name, size, inputs, norm_by_times=False, device=None):
-        super(CTCLayer, self).__init__(name, 'ctc', size, inputs, device=device)
-        self.config.norm_by_times = norm_by_times
-        config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
-
-
-@config_layer('kmax_seq_score')
-class KmaxSeqScoreLayer(LayerBase):
-    def __init__(self, name, inputs, beam_size, **xargs):
-        super(KmaxSeqScoreLayer, self).__init__(
-            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
-        self.config.beam_size = beam_size
-
-
-@config_layer('warp_ctc')
-class WarpCTCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 blank=0,
-                 norm_by_times=False,
-                 device=None):
-        super(WarpCTCLayer, self).__init__(
-            name, 'warp_ctc', size=size, inputs=inputs, device=device)
-        self.config.blank = blank
-        self.config.norm_by_times = norm_by_times
-        config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs')
-        input_layer = self.get_input_layer(0)
-        config_assert(
-            (input_layer.active_type == '' or
-             input_layer.active_type == 'linear'),
-            "Expecting the active_type of input layer to be linear or null")
-
-
-@config_layer('recurrent_layer_group')
-class RecurrentLayerGroup(LayerBase):
-    def __init__(self, name, device=None):
-        super(RecurrentLayerGroup, self).__init__(
-            name, 'recurrent_layer_group', 0, inputs=[], device=device)
-
-
-@config_layer('switch_order')
-class SwitchOrderLayer(LayerBase):
-    def __init__(self, name, inputs, reshape, **xargs):
-        super(SwitchOrderLayer, self).__init__(
-            name, 'switch_order', 0, inputs=inputs, **xargs)
-        self.config.reshape_conf.height_axis.extend(reshape['height'])
-        self.config.reshape_conf.width_axis.extend(reshape['width'])
-        input_layer = self.get_input_layer(0)
-        if reshape is None:
-            self.set_layer_size(input_layer.size)
-        else:
-            in_h = input_layer.height
-            in_w = input_layer.width
-            out_dims = None
-            if input_layer.has_depth():
-                in_d = input_layer.depth
-                in_c = input_layer.size / in_h / in_w / in_d
-                # batch_size, depth, height, width, channel
-                out_dims = [0, in_d, in_h, in_w, in_c]
-            else:
-                in_c = input_layer.size / in_h / in_w
-                # batch_size, height, width, channel
-                out_dims = [0, in_h, in_w, in_c]
-            # Because (reshape['width'][0] > 0) always be true.
-            # So out_dims[0] won't be used.
-            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
-            self.set_layer_size(size)
-
-
-@config_layer('scale_sub_region')
-class ScaleSubRegionLayer(LayerBase):
-    def __init__(self, name, inputs, value, **xargs):
-        super(ScaleSubRegionLayer, self).__init__(
-            name, 'scale_sub_region', 0, inputs=inputs, **xargs)
-        scale_sub_region_conf = self.config.inputs[0].scale_sub_region_conf
-        scale_sub_region_conf.value = value
-
-        # get channel, width and height from input_0 layer
-        input_layer = self.get_input_layer(0)
-        image_conf = scale_sub_region_conf.image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
-                           image_conf.channels)
-
-
-@config_layer('factorization_machine')
-class FactorizationMachineLayer(LayerBase):
-    def __init__(self, name, inputs, factor_size, **xargs):
-        super(FactorizationMachineLayer, self).__init__(
-            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'factorization machine layer must have one and only one input.')
-        self.config.factor_size = factor_size
-        input_layer = self.get_input_layer(0)
-        psize = input_layer.size * factor_size
-        dims = [input_layer.size, factor_size]
-        self.create_input_parameter(0, psize, dims)
-
-
-# Deprecated, use a new layer specific class instead
-@config_func
-def Layer(name, type, **xargs):
-    layers = {}
-    layers.update(g_cost_map)
-    layers.update(g_layer_type_map)
-    layer_func = layers.get(type)
-    config_assert(layer_func, "layer type '%s' not supported." % type)
-    return layer_func(name, **xargs)
-
-
-@config_func
-def ParameterHook(type, **kwargs):
-    if type == 'pruning':
-        hook = ParameterUpdaterHookConfig()
-        hook.type = type
-        sparsity_ratio = kwargs.get('sparsity_ratio', None)
-        if sparsity_ratio is not None:
-            hook.sparsity_ratio = sparsity_ratio
-        return hook
-    elif type == 'dpruning':
-        hook = ParameterUpdaterHookConfig()
-        hook.type = type
-        return hook
-    else:
-        return None
-
-
-@config_func
-def Parameter(name,
-              size,
-              device,
-              dims,
-              learning_rate=None,
-              momentum=None,
-              decay_rate=None,
-              decay_rate_l1=None,
-              initial_mean=None,
-              initial_std=None,
-              initial_strategy=None,
-              initial_smart=None,
-              num_batches_regularization=None,
-              sparse_remote_update=None,
-              sparse_update=None,
-              gradient_clipping_threshold=None,
-              sparse=None,
-              format=None,
-              need_compact=None,
-              is_static=None,
-              is_shared=None,
-              update_hooks=None,
-              initializer=None):
-
-    config_assert(name not in g_parameter_map,
-                  'Duplicated parameter name: ' + name)
-
-    para = g_config.model_config.parameters.add()
-    para.name = name
-    para.size = size
-    if device is not None:
-        para.device = int(device)
-    para.dims.extend(dims)
-
-    if learning_rate is not None:
-        para.learning_rate = float(learning_rate)
-
-    momentum = default(momentum, g_default_momentum)
-    if momentum is not None:
-        para.momentum = float(momentum)
-
-    config_assert(not momentum or not decay_rate_l1,
-                  "momentum and decay_rate_l1 cannot both be non-zero")
-
-    decay_rate = default(decay_rate, g_default_decay_rate)
-    if decay_rate is not None:
-        para.decay_rate = decay_rate
-
-    if decay_rate_l1 is not None:
-        para.decay_rate_l1 = decay_rate_l1
-    para.initial_std = default(initial_std, g_default_initial_std)
-    para.initial_mean = default(initial_mean, g_default_initial_mean)
-
-    num_batches_regularization = default(num_batches_regularization,
-                                         g_default_num_batches_regularization)
-    if num_batches_regularization is not None:
-        para.num_batches_regularization = int(num_batches_regularization)
-
-    if sparse_remote_update is not None:
-        para.sparse_remote_update = sparse_remote_update
-        if sparse_remote_update:
-            g_config.opt_config.use_sparse_remote_updater = True
-    if sparse_update is not None:
-        para.sparse_update = sparse_update
-    gradient_clipping_threshold = default(gradient_clipping_threshold,
-                                          g_default_gradient_clipping_threshold)
-    if gradient_clipping_threshold is not None:
-        para.gradient_clipping_threshold = gradient_clipping_threshold
-    para.initial_strategy = default(initial_strategy,
-                                    g_default_initial_strategy)
-    para.initial_smart = default(initial_smart, g_default_initial_smart)
-    if para.initial_smart:
-        para.initial_mean = 0.
-        if len(para.dims) != 0:
-            para.initial_std = 1. / math.sqrt(para.dims[0])
-        else:
-            print(
-                "Use initial_smart, but dims not set. Initial_smart may not be used in this layer"
-            )
-            traceback.print_exc()
-            para.initial_std = 1. / math.sqrt(para.size)
-    if g_default_compact_func is not None:
-        sparse, format, need_compact = g_default_compact_func(para.name)
-
-    if sparse is not None:
-        para.is_sparse = sparse
-    if format is not None:
-        para.format = format
-    if need_compact is not None:
-        para.need_compact = need_compact
-    if is_static is not None:
-        para.is_static = is_static
-    config_assert(not para.sparse_remote_update or not para.is_static,
-                  "sparse_remote_update and is_static cannot both be true")
-    if is_shared is not None:
-        para.is_shared = is_shared
-
-    update_hooks = default(update_hooks, g_default_update_hooks)
-
-    if update_hooks is not None:
-        if hasattr(update_hooks, '__call__'):
-            update_hooks = update_hooks()
-
-        if isinstance(update_hooks, list):
-            for hook in update_hooks:
-                para.update_hooks.extend([hook])
-        else:
-            para.update_hooks.extend([update_hooks])
-
-    g_parameter_map[name] = para
-    if initializer is not None:
-        config_assert(
-            callable(initializer),
-            "parameter initializer should be a callable object")
-        g_parameter_initializer_map[name] = initializer
-
-
-@config_func
-def default_initial_std(val):
-    global g_default_initial_std
-    g_default_initial_std = val
-
-
-@config_func
-def default_initial_mean(val):
-    global g_default_initial_mean
-    g_default_initial_mean = val
-
-
-@config_func
-def default_initial_strategy(val):
-    global g_default_initial_strategy
-    g_default_initial_strategy = val
-
-
-@config_func
-def default_initial_smart(val):
-    global g_default_initial_smart
-    g_default_initial_smart = val
-
-
-@config_func
-def default_momentum(val):
-    global g_default_momentum
-    g_default_momentum = val
-
-
-@config_func
-def default_decay_rate(val):
-    global g_default_decay_rate
-    g_default_decay_rate = val
-
-
-@config_func
-def default_num_batches_regularization(val):
-    global g_default_num_batches_regularization
-    g_default_num_batches_regularization = val
-
-
-@config_func
-def default_gradient_clipping_threshold(val):
-    global g_default_gradient_clipping_threshold
-    g_default_gradient_clipping_threshold = val
-
-
-@config_func
-def default_device(val):
-    global g_default_device
-    g_default_device = val
-
-
-@config_func
-def default_update_hooks(val):
-    global g_default_update_hooks
-    g_default_update_hooks = val
-
-
-@config_func
-def default_compact_func(val):
-    global g_default_compact_func
-    g_default_compact_func = val
-
-
-def make_importer(config_dir, config_args):
-    def Import(config_file, local_args={}):
-        if not config_file.startswith('/'):
-            config_file = config_dir + '/' + config_file
-            g_config.config_files.append(config_file)
-        execfile(config_file,
-                 make_config_environment(config_file, config_args), local_args)
-
-    return Import
-
-
-DEFAULT_SETTING = dict(
-    batch_size=None,
-    mini_batch_size=None,
-    algorithm='async_sgd',
-    async_lagged_grad_discard_ratio=1.5,
-    learning_method='momentum',
-    gradient_clipping_threshold=None,
-    num_batches_per_send_parameter=None,
-    num_batches_per_get_parameter=None,
-    center_parameter_update_method=None,
-    learning_rate=1.,
-    learning_rate_decay_a=0.,
-    learning_rate_decay_b=0.,
-    learning_rate_schedule='poly',
-    learning_rate_args='',
-    l1weight=0.1,
-    l2weight=0.,
-    l2weight_zero_iter=0,
-    c1=0.0001,
-    backoff=0.5,
-    owlqn_steps=10,
-    max_backoff=5,
-    average_window=0,
-    do_average_in_cpu=False,
-    max_average_window=None,
-    ada_epsilon=1e-6,
-    ada_rou=0.95,
-    delta_add_rate=1.0,
-    shrink_parameter_value=0,
-    adam_beta1=0.9,
-    adam_beta2=0.999,
-    adam_epsilon=1e-8, )
-
-settings = copy.deepcopy(DEFAULT_SETTING)
-
-settings_deprecated = dict(usage_ratio=1., )
-
-trainer_settings = dict(
-    save_dir="./output/model",
-    init_model_path=None,
-    start_pass=0, )
-
-
-@config_func
-def Settings(**args):
-    for k, v in args.iteritems():
-        if k == "usage_ratio":
-            logger.warning(
-                "Deprecated: define usage_ratio in DataConfig instead")
-            if g_config.HasField("data_config"):
-                g_config.data_config.__setattr__(k, v)
-            settings_deprecated[k] = v
-            continue
-        elif k in settings:
-            settings[k] = v
-        elif k in trainer_settings:
-            trainer_settings[k] = v
-        else:
-            logger.fatal('Unkown setting: %s' % k)
-
-
-@config_func
-def cluster_config(**args):
-    pass
-
-
-@config_func
-def EnableSubmodelSuffix(flag=True):
-    """
-    If enabled, the layer and evaluator names in submodel will be automatically
-    appended with @submodel_name
-    """
-    global g_add_submodel_suffix
-    g_add_submodel_suffix = flag
-
-
-def make_config_environment(config_file, config_args):
-    def make_setter(k):
-        def setter(v):
-            logger.fatal("Obsolete: use Settings(%s=%s, ...) instead" % (k, v))
-
-        return setter
-
-    funcs = {}
-    funcs.update(g_config_funcs)
-
-    for k in settings.iterkeys():
-        funcs[k] = make_setter(k)
-    for k in settings_deprecated.iterkeys():
-        funcs[k] = make_setter(k)
-    config_dir = os.path.dirname(config_file)
-    if not config_dir:
-        config_dir = '.'
-
-    funcs.update(
-        Import=make_importer(config_dir, config_args),
-        get_config_arg=make_get_config_arg(config_args), )
-
-    funcs.update(g_extended_config_funcs)
-
-    return funcs
-
-
-def make_get_config_arg(config_args):
-    def get_config_arg(name, type, default=None):
-        if type == bool:
-            s = config_args.get(name)
-            if not s:
-                return default
-            if s == 'True' or s == '1' or s == 'true':
-                return True
-            if s == 'False' or s == '0' or s == 'false':
-                return False
-            raise ValueError('Value of config_arg %s is not boolean' % name)
-        else:
-            return type(config_args.get(name, default))
-
-    return get_config_arg
-
-
-def importlib(name):
-    __import__(name)
-    return sys.modules[name]
-
-
-def find_caller():
-    stack = traceback.extract_stack()
-    for s in stack[-4::-1]:
-        if not s[0].endswith('config_parser.py'):
-            return s[0], s[1], s[2]
-    return "(unknown file)", 0, "(unknown function)"
-
-
-def my_fatal(s):
-    logger.critical(s)
-    raise Exception()
-
-
-_parse_config_hooks = set()
-
-
-def register_parse_config_hook(f):
-    """
-    Register a hook function for parse_config. parse_config will invoke the hook
-    at the beginning of parse. This make it possible to reset global state for
-    for constructing the model.
-    """
-    _parse_config_hooks.add(f)
-
-
-def update_g_config():
-    '''
-    Update g_config after execute config_file or config_functions.
-    '''
-    for k, v in settings.iteritems():
-        if v is None:
-            continue
-        g_config.opt_config.__setattr__(k, v)
-
-    for k, v in trainer_settings.iteritems():
-        if v is None:
-            continue
-        g_config.__setattr__(k, v)
-
-    for name in g_config.model_config.input_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
-            'The type of input layer "%s" is not "data"' % name
-    for name in g_config.model_config.output_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-    return g_config
-
-
-def begin_parse():
-    init_config_environment()
-    for hook in _parse_config_hooks:
-        hook()
-
-    logger.findCaller = find_caller
-    logger.fatal = my_fatal
-
-    g_config.model_config.type = "nn"
-
-    global g_current_submodel, g_root_submodel
-    g_root_submodel = g_config.model_config.sub_models.add()
-    g_root_submodel.name = 'root'
-    g_root_submodel.is_recurrent_layer_group = False
-    g_current_submodel = g_root_submodel
-
-
-def parse_config(trainer_config, config_arg_str):
-    '''
-    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
-    passed to config script as a dictionary CONFIG_ARGS
-    '''
-
-    begin_parse()
-    config_args = {}
-
-    if config_arg_str:
-        config_args = dict([f.split('=') for f in config_arg_str.split(',')])
-
-    global g_command_config_args
-    g_command_config_args.update(config_args)
-
-    extension_module_name = config_args.get('extension_module_name')
-    if extension_module_name:
-        global g_extended_config_funcs
-        extension_module = importlib(extension_module_name)
-        g_extended_config_funcs = extension_module.get_config_funcs(g_config)
-
-    if hasattr(trainer_config, '__call__'):
-        trainer_config.func_globals.update(
-            make_config_environment("", config_args))
-        trainer_config()
-    else:
-        execfile(trainer_config,
-                 make_config_environment(trainer_config, config_args))
-
-    return update_g_config()
-
-
-def parse_config_and_serialize(trainer_config, config_arg_str):
-    try:
-        config = parse_config(trainer_config, config_arg_str)
-        #logger.info(config)
-        return config.SerializeToString()
-    except:
-        traceback.print_exc()
-        raise
-
-
-if __name__ == '__main__':
-    try:
-        config = parse_config(sys.argv[1], '')
-        config.SerializeToString()
-        __real_print__(str(config))
-    except:
-        traceback.print_exc()
-        raise
diff --git a/python/paddle/trainer/config_parser_extension.py b/python/paddle/trainer/config_parser_extension.py
deleted file mode 100644
index b9e0f3eb13d..00000000000
--- a/python/paddle/trainer/config_parser_extension.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.proto.DataConfig_pb2 import DataConfig
-
-g_config = None
-
-
-def SimpleData(files=None,
-               feat_dim=None,
-               context_len=None,
-               buffer_capacity=None):
-
-    data_config = DataConfig()
-    data_config.type = 'simple'
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    if context_len is not None:
-        data_config.context_len = context_len
-    if buffer_capacity:
-        data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-def get_config_funcs(trainer_config):
-    global g_config
-    g_config = trainer_config
-    return dict(SimpleData=SimpleData)
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
deleted file mode 100644
index ef92107a109..00000000000
--- a/python/paddle/trainer/recurrent_units.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# recurrent_units.py
-# Version 2.0
-#
-# Some recurrent units can be used in recurrent layer group,
-#   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units
-#
-# The modules in this file are DEPRECATED.
-# If you would like to use lstm/gru
-# please use the functions defined in paddle.trainer_config_helpers.
-
-from paddle.trainer.config_parser import *
-
-
-# long short term memory, can be used in recurrent machine
-# *inputs* must be a list of Projections, for example:
-#   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of
-#   two LstmRecurrentUnit is same, they share same parameters
-# *out_memory* can be defined outside if it's used outside
-def LstmRecurrentUnit(name,
-                      size,
-                      active_type,
-                      state_active_type,
-                      gate_active_type,
-                      inputs,
-                      para_prefix=None,
-                      error_clipping_threshold=0,
-                      out_memory=None):
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    state_memory = Memory(name=name + "_" + "state", size=size)
-
-    Layer(
-        name=name + "_" + "input_recurrent",
-        type="mixed",
-        size=size * 4,  #(input_s, input_gate, forget_gate, output_gate)
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"),
-        inputs=inputs + [
-            FullMatrixProjection(
-                out_memory, parameter_name=para_prefix + "_input_recurrent.w"),
-        ], )
-    LstmStepLayer(
-        name=name,
-        size=size,
-        bias=Bias(parameter_name=para_prefix + "_check.b"),
-        inputs=[name + "_" + "input_recurrent", state_memory],
-        active_type=active_type,
-        active_gate_type=gate_active_type,
-        active_state_type=state_active_type, )
-    GetOutputLayer(
-        name=name + "_" + "state",
-        size=size,
-        inputs=Input(
-            name, input_layer_argument="state"), )
-
-
-def LstmRecurrentUnitNaive(name,
-                           size,
-                           active_type,
-                           state_active_type,
-                           gate_active_type,
-                           inputs,
-                           para_prefix=None,
-                           error_clipping_threshold=0,
-                           out_memory=None):
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    state_memory = Memory(name=name + "_" + "state", size=size)
-
-    Layer(
-        name=name + "_" + "input_recurrent",
-        type="mixed",
-        size=size * 4,  #(input_s, input_gate, forget_gate, output_gate)
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"),
-        inputs=inputs + [
-            FullMatrixProjection(
-                out_memory, parameter_name=para_prefix + "_input_recurrent.w"),
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "input_s",
-        size=size,
-        active_type=active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=0)
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "input_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size), DotMulProjection(
-                    state_memory, parameter_name=para_prefix + "_input_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "forget_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size * 2),
-            DotMulProjection(
-                state_memory, parameter_name=para_prefix + "_forget_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "state",
-        inputs=[
-            DotMulOperator([name + "_" + "input_s", name + "_" + "input_gate"]),
-            DotMulOperator([state_memory, name + "_" + "forget_gate"]),
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "output_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size * 3),
-            DotMulProjection(
-                name + "_" + "state",
-                parameter_name=para_prefix + "_output_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "state_atv",
-        active_type=state_active_type,
-        inputs=IdentityProjection(name + "_" + "state"), )
-    ExpressionLayer(
-        name=name,
-        inputs=DotMulOperator(
-            [name + "_" + "state_atv", name + "_" + "output_gate"]), )
-
-
-# like LstmRecurrentUnit, but it's a layer group.
-# it is equivalent to LstmLayer
-def LstmRecurrentLayerGroup(name,
-                            size,
-                            active_type,
-                            state_active_type,
-                            gate_active_type,
-                            inputs,
-                            para_prefix=None,
-                            error_clipping_threshold=0,
-                            seq_reversed=False):
-
-    input_layer_name = name + "_" + "transform_input"
-    Layer(
-        name=input_layer_name,
-        type="mixed",
-        size=size * 4,
-        active_type="",
-        bias=False,
-        inputs=inputs, )
-
-    RecurrentLayerGroupBegin(
-        name + "_layer_group",
-        in_links=[input_layer_name],
-        out_links=[name],
-        seq_reversed=seq_reversed)
-
-    LstmRecurrentUnit(
-        name=name,
-        size=size,
-        active_type=active_type,
-        state_active_type=state_active_type,
-        gate_active_type=gate_active_type,
-        inputs=[IdentityProjection(input_layer_name)],
-        para_prefix=para_prefix,
-        error_clipping_threshold=error_clipping_threshold, )
-
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-# gated recurrent unit, can be used in recurrent machine
-# *inputs* should be a list of Projections, for example:
-#   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of
-#   two GatedRecurrentUnit is same, they share same parameters
-# *out_memory* can be defined outside if it's used outside
-
-
-def GatedRecurrentUnit(name,
-                       size,
-                       active_type,
-                       gate_active_type,
-                       inputs,
-                       para_prefix=None,
-                       error_clipping_threshold=0,
-                       out_memory=None):
-    if type_of(inputs) == str:  #only used by GatedRecurrentLayerGroup
-        input_layer_name = inputs
-    else:
-        input_layer_name = name + "_" + "transform_input"
-        Layer(
-            name=input_layer_name,
-            type="mixed",
-            size=size * 3,
-            active_type="",
-            bias=False,
-            inputs=inputs, )
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    GruStepLayer(
-        name=name,
-        size=size,
-        bias=Bias(parameter_name=para_prefix + "_gate.b"),
-        inputs=[
-            input_layer_name, Input(
-                out_memory, parameter_name=para_prefix + "_gate.w")
-        ],
-        active_type=active_type,
-        active_gate_type=gate_active_type, )
-
-
-def GatedRecurrentUnitNaive(name,
-                            size,
-                            active_type,
-                            gate_active_type,
-                            inputs,
-                            para_prefix=None,
-                            error_clipping_threshold=0,
-                            out_memory=None):
-
-    if type_of(inputs) == str:  #only used by GatedRecurrentLayerGroup
-        input_layer_name = inputs
-    else:
-        input_layer_name = name + "_" + "transform_input"
-        Layer(
-            name=input_layer_name,
-            type="mixed",
-            size=size * 3,
-            active_type="",
-            bias=False,
-            inputs=inputs, )
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    Layer(
-        name=name + "_" + "update_gate",
-        type="mixed",
-        size=size,
-        active_type=gate_active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_update_gate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=0), FullMatrixProjection(
-                    out_memory, parameter_name=para_prefix + "_update_gate.w")
-        ], )
-    Layer(
-        name=name + "_" + "reset_gate",
-        type="mixed",
-        size=size,
-        active_type=gate_active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_reset_gate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=size), FullMatrixProjection(
-                    out_memory, parameter_name=para_prefix + "_reset_gate.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "reset_output",
-        inputs=DotMulOperator([out_memory, name + "_" + "reset_gate"]), )
-    Layer(
-        name=name + "_" + "output_candidate",
-        type="mixed",
-        size=size,
-        active_type=active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_output_candidate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=size * 2), FullMatrixProjection(
-                    name + "_" + "reset_output",
-                    parameter_name=para_prefix + "_output_candidate.w")
-        ], )
-    ExpressionLayer(  #element-wise interpolation
-        name=name,
-        inputs=[
-            IdentityProjection(out_memory),
-            DotMulOperator(
-                [out_memory, name + "_" + "update_gate"], scale=-1.0),
-            DotMulOperator(
-                [name + "_" + "output_candidate", name + "_" + "update_gate"]),
-        ], )
-
-
-# like GatedRecurrentUnit, but it's a layer group.
-# it is equivalent to GatedRecurrentLayer.
-def GatedRecurrentLayerGroup(name,
-                             size,
-                             active_type,
-                             gate_active_type,
-                             inputs,
-                             para_prefix=None,
-                             error_clipping_threshold=0,
-                             seq_reversed=False):
-
-    input_layer_name = name + "_" + "transform_input"
-    Layer(
-        name=input_layer_name,
-        type="mixed",
-        size=size * 3,
-        active_type="",
-        bias=False,
-        inputs=inputs, )
-
-    RecurrentLayerGroupBegin(
-        name + "_layer_group",
-        in_links=[input_layer_name],
-        out_links=[name],
-        seq_reversed=seq_reversed)
-
-    GatedRecurrentUnit(
-        name=name,
-        size=size,
-        active_type=active_type,
-        gate_active_type=gate_active_type,
-        inputs=input_layer_name,  #transform outside
-        para_prefix=para_prefix,
-        error_clipping_threshold=error_clipping_threshold, )
-
-    RecurrentLayerGroupEnd(name + "_layer_group")
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
deleted file mode 100644
index 13155ebddbb..00000000000
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from activations import *
-from data_sources import *
-from poolings import *
-from evaluators import *
-from layers import *
-from networks import *
-from optimizers import *
-from attrs import *
-from config_parser_utils import *
-# This will enable operator overload for LayerOutput
-import layer_math
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
deleted file mode 100644
index 36839682622..00000000000
--- a/python/paddle/trainer_config_helpers/activations.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    "TanhActivation", "SigmoidActivation", "SoftmaxActivation",
-    "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
-    'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
-    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation",
-    "SoftSignActivation"
-]
-
-
-class BaseActivation(object):
-    """
-    A mark for activation class.
-    Each activation inherit BaseActivation, which has two parameters.
-
-    :param name: activation name in paddle config.
-    :type name: basestring
-    :param support_hppl: True if supported by hppl. HPPL is a library used by paddle
-                         internally. Currently, lstm layer can only use activations
-                         supported by hppl.
-    :type support_hppl: bool
-    """
-
-    def __init__(self, name, support_hppl):
-        self.name = name
-        self.support_hppl = support_hppl
-
-    def __repr__(self):
-        return self.name
-
-
-class TanhActivation(BaseActivation):
-    """
-    Tanh activation.
-
-    .. math::
-
-       f(z)=tanh(z)=\\frac{e^z-e^{-z}}{e^z+e^{-z}}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'tanh', True)
-
-
-class SigmoidActivation(BaseActivation):
-    """
-    Sigmoid activation.
-
-    .. math::
-
-       f(z) = \\frac{1}{1+exp(-z)}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sigmoid', True)
-
-
-class SoftmaxActivation(BaseActivation):
-    """
-    Softmax activation for simple input
-
-
-
-    .. math::
-
-       P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softmax', False)
-
-
-class SequenceSoftmaxActivation(BaseActivation):
-    """
-    Softmax activation for one sequence. The dimension of input feature must be
-    1 and a sequence.
-
-    ..  code:: python
-
-        result = softmax(for each_feature_vector[0] in input_feature)
-        for i, each_time_step_output in enumerate(output):
-            each_time_step_output = result[i]
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sequence_softmax', False)
-
-
-class IdentityActivation(BaseActivation):
-    """
-    Identity Activation.
-
-    Just do nothing for output both forward/backward.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, '', False)
-
-
-LinearActivation = IdentityActivation
-
-
-class ReluActivation(BaseActivation):
-    """
-    Relu activation.
-
-    forward. :math:`y = max(0, z)`
-
-    derivative:
-
-    .. math::
-
-       1  &\\quad if z > 0 \\\\
-       0  &\\quad\\mathrm{otherwize}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'relu', True)
-
-
-class BReluActivation(BaseActivation):
-    """
-    BRelu Activation.
-
-    forward.  :math:`y = min(24, max(0, z))`
-
-    derivative:
-
-    .. math::
-
-       1  &\\quad if 0 < z < 24 \\\\
-       0  &\\quad \\mathrm{otherwise}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'brelu', False)
-
-
-class SoftReluActivation(BaseActivation):
-    """
-    SoftRelu Activation.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softrelu', False)
-
-
-class STanhActivation(BaseActivation):
-    """
-    Scaled Tanh Activation.
-
-    .. math::
-
-       f(z) = 1.7159 * tanh(2/3*z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'stanh', False)
-
-
-class AbsActivation(BaseActivation):
-    """
-    Abs Activation.
-
-    Forward:    :math:`f(z) = abs(z)`
-
-    Derivative:
-
-    .. math::
-
-       1 &\\quad if \\quad z > 0 \\\\
-       -1 &\\quad if \\quad z < 0 \\\\
-       0 &\\quad if \\quad z = 0
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'abs', False)
-
-
-class SquareActivation(BaseActivation):
-    """
-    Square Activation.
-
-    .. math::
-       f(z) = z^2.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'square', False)
-
-
-class ExpActivation(BaseActivation):
-    """
-    Exponential Activation.
-
-    .. math::
-       f(z) = e^z.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'exponential', False)
-
-
-class LogActivation(BaseActivation):
-    """
-    Logarithm Activation.
-
-    .. math::
-       f(z) = log(z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'log', False)
-
-
-class SqrtActivation(BaseActivation):
-    """
-    Square Root Activation.
-
-    .. math::
-       f(z) = sqrt(z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sqrt', False)
-
-
-class ReciprocalActivation(BaseActivation):
-    """
-    Reciprocal Activation.
-
-    .. math::
-       f(z)=\\frac{1}{z}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'reciprocal', False)
-
-
-class SoftSignActivation(BaseActivation):
-    """
-    SoftSign Activation.
-
-    .. math::
-       f(z)=\\frac{z}{1 + |z|}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softsign', False)
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
deleted file mode 100644
index 4e3beaf639b..00000000000
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import *
-__all__ = [
-    'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute',
-    'ExtraLayerAttribute'
-]
-
-
-def convert_and_compare(x, Type):
-    """
-    Convert x to be the same type as Type and then convert back to
-    check whether there is a loss of information
-    :param x: object to be checked
-    :param Type: target type to check x over
-
-    """
-    return type(x)(Type(x)) == x
-
-
-def is_compatible_with(x, Type):
-    """
-    Check if x has a type compatible with Type
-    :param x: object to be checked
-    :param Type: target type to check x over
-
-    """
-    if type(x) == Type:
-        return True
-    try:
-        if float == Type or int == Type:
-            # avoid those types that can be converted to float/int but not very
-            # meaningful and  could potentially lead to error
-            # i.e., str and bool typed value should not be used for initializing float/int variable
-            if not isinstance(x, str) and not isinstance(x, bool):
-                return convert_and_compare(x, Type)
-        elif bool == Type:
-            # should not use string type to initialize bool variable
-            if not isinstance(x, str):
-                return convert_and_compare(x, Type)
-        else:
-            return False
-    except:
-        return False
-
-
-class HookAttribute(object):
-    """
-    Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs
-    during training process of a layer with parameters, such as img_conv layer, fc layer.
-
-    :param  type: Hook type, currently supported types:
-                        'pruning' :  user specify a sparsity_ratio before training started, and the
-                            network will prune the parameters based on the sparsity_ratio.
-                            eg: The definition of Hook object can be hk = HookAttribute('pruning', 0.6)
-                            The specific usage can be paddle.layer.img_conv(input=img, filter_size=3,
-                                                                       num_channels=3, num_filters=64,
-                                                                       param_attr=ParameterAttribute(update_hooks=hk) )
-                            The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf
-    :type type: string
-
-    :param sparsity_ratio: Must be specified if hook type is 'pruning',
-                        it represents the ratio of the zero elements to be set by the Parameter.
-    :type sparsity_ratio: float or None
-
-    """
-
-    def __init__(self, type, sparsity_ratio=None):
-        self.type = type
-        self.sparsity_ratio = sparsity_ratio
-        if self.sparsity_ratio is not None:
-            assert is_compatible_with(
-                self.sparsity_ratio,
-                float), 'sparisity_ratio must be float type'
-            assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparsity_ratio must be a float between [0, 1] '
-
-    def __call__(self):
-        return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio)
-
-
-class ParameterAttribute(object):
-    """
-    Parameter Attributes object. To fine-tuning network training process, user
-    can set attribute to control training details, such as l1,l2 rate / learning
-    rate / how to init param.
-
-    NOTE: IT IS A HIGH LEVEL USER INTERFACE.
-
-    :param is_static: True if this parameter will be fixed while training.
-    :type is_static: bool
-
-    :param initial_std: Gauss Random initialization standard deviation.
-                        None if not using Gauss Random initialize parameter.
-    :type initial_std: float or None
-    :param initial_mean:  Gauss Random initialization mean.
-                         None if not using Gauss Random initialize parameter.
-    :type initial_mean: float or None
-    :param initial_max: Uniform initialization max value.
-    :type initial_max: float or None
-    :param initial_min: Uniform initialization min value.
-    :type initial_min: float or None
-    :param l1_rate: the l1 regularization factor
-    :type l1_rate: float or None
-    :param l2_rate: the l2 regularization factor
-    :type l2_rate: float or None
-    :param learning_rate: The parameter learning rate. None means 1.
-                          The learning rate when optimize is LEARNING_RATE =
-                          GLOBAL_LEARNING_RATE * PARAMETER_LEARNING_RATE
-                          * SCHEDULER_FACTOR.
-
-    :type learning_rate: float or None
-    :param momentum: The parameter momentum. None means use global value.
-    :type momentum: float or None
-    :param gradient_clipping_threshold: gradient clipping threshold. If gradient
-                                        value larger than some value, will be
-                                        clipped.
-    :type gradient_clipping_threshold: float
-    :param sparse_update: Enable sparse update for this parameter. It will
-                          enable both local and remote sparse update.
-    :type sparse_update: bool
-    :param update_hooks: A HookAttribute object.
-    :type update_hooks: HookAttribute
-    :param initializer: If not None, it should be a callable object which accepts
-                        a parameter name and returns numpy array for the initial
-                        value of the parameter
-    :type initializer: callable object
-    """
-
-    def __init__(self,
-                 name=None,
-                 is_static=False,
-                 initial_std=None,
-                 initial_mean=None,
-                 initial_max=None,
-                 initial_min=None,
-                 l1_rate=None,
-                 l2_rate=None,
-                 learning_rate=None,
-                 momentum=None,
-                 gradient_clipping_threshold=None,
-                 sparse_update=False,
-                 update_hooks=None,
-                 initializer=None):
-        self.attr = {}
-
-        if is_static:
-            self.attr['is_static'] = True
-
-        if initial_std is None and initial_mean is None and initial_max \
-                is None and initial_min is None:
-            self.attr['initial_smart'] = True
-        elif is_compatible_with(initial_std, float) or \
-             is_compatible_with(initial_mean, float):
-            if initial_std is not None:
-                self.attr['initial_std'] = initial_std
-            if initial_mean is not None:
-                self.attr['initial_mean'] = initial_mean
-            self.attr['initial_strategy'] = 0  # Gauss Random
-        elif is_compatible_with(initial_max, float) and \
-             is_compatible_with(initial_min, float):
-            initial_max = initial_max
-            initial_min = initial_min
-            assert initial_min < initial_max
-            initial_mean = (initial_max + initial_min) / 2
-            initial_std = initial_mean - initial_min
-            self.attr['initial_mean'] = initial_mean
-            self.attr['initial_std'] = initial_std
-            self.attr['initial_strategy'] = 1  # Uniform Random
-        else:
-            raise RuntimeError("Unexpected branch.")
-
-        if not is_static and is_compatible_with(l1_rate, float):
-            self.attr['decay_rate_l1'] = l1_rate
-
-        if not is_static and is_compatible_with(l2_rate, float):
-            self.attr['decay_rate'] = l2_rate
-
-        if not is_static and is_compatible_with(learning_rate, float):
-            self.attr['learning_rate'] = learning_rate
-
-        if not is_static and is_compatible_with(momentum, float):
-            self.attr['momentum'] = momentum
-
-        if name is not None:
-            self.attr['parameter_name'] = name
-
-        if sparse_update:
-            self.attr['sparse_update'] = True
-            self.attr['sparse_remote_update'] = True
-
-        if gradient_clipping_threshold is not None and \
-                is_compatible_with(gradient_clipping_threshold, float):
-            self.attr['gradient_clipping_threshold'] = \
-                gradient_clipping_threshold
-        if initializer is not None:
-            self.attr['initializer'] = initializer
-
-        if update_hooks:
-            self.attr['update_hooks'] = update_hooks
-
-    def set_default_parameter_name(self, name):
-        """
-        Set default parameter name. If parameter not set, then will use default
-        parameter name.
-
-
-        :param name: default parameter name.
-        :type name: basestring
-        """
-        if 'parameter_name' not in self.attr:
-            self.attr['parameter_name'] = name
-
-    @staticmethod
-    def to_bias(bias_attr):
-        if isinstance(bias_attr, ParameterAttribute):
-            return Bias(**bias_attr.attr)
-        else:
-            return False
-
-
-class ExtraLayerAttribute(object):
-    """
-    Some high level layer attributes config. You can set all attributes here,
-    but some layer doesn't support all attributes. If you set an attribute to a
-    layer that not support this attribute, paddle will print an error and core.
-
-    :param error_clipping_threshold: Error clipping threshold.
-    :type error_clipping_threshold: float
-    :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
-                      The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `JMLRdropout
-                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
-                      >`_.
-    :type drop_rate: float
-    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `use_case
-                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
-                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
-                   -different-devices>`_.
-    :type device: int
-    """
-
-    def __init__(self,
-                 error_clipping_threshold=None,
-                 drop_rate=None,
-                 device=None):
-        self.attr = dict()
-        if error_clipping_threshold is not None:
-            error_clipping_threshold = float(error_clipping_threshold)
-            if error_clipping_threshold < 0:
-                raise ValueError("Error clipping must > 0")
-            self.attr['error_clipping_threshold'] = error_clipping_threshold
-        if drop_rate is not None:
-            drop_rate = float(drop_rate)
-            if drop_rate < 0:
-                raise ValueError("Dropout rate must > 0")
-            self.attr["drop_rate"] = drop_rate
-
-        if isinstance(device, int):
-            self.attr["device"] = device
-
-    def check(self, layer_name):
-        for key in self.attr:
-            if not hasattr(self, 'can_%s' % key) or \
-                    not getattr(self, 'can_%s' % key):
-                raise NotImplementedError("Layer %s does not support %s" %
-                                          (layer_name, key))
-
-    @staticmethod
-    def to_kwargs(attr):
-        if attr is None:
-            return dict()
-        else:
-            return attr.attr
-
-
-HookAttr = HookAttribute
-ParamAttr = ParameterAttribute
-ExtraAttr = ExtraLayerAttribute
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
deleted file mode 100644
index ee5bbbfb2de..00000000000
--- a/python/paddle/trainer_config_helpers/config_parser_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import paddle.trainer.config_parser as config_parser
-from paddle.proto.TrainerConfig_pb2 import OptimizationConfig
-'''
-This file is a wrapper of formal config_parser. The main idea of this file is to
-separete different config logic into different function, such as network configuration
- and optimizer configuration.
-'''
-
-__all__ = [
-    "parse_trainer_config", "parse_network_config", "parse_optimizer_config",
-    "reset_parser"
-]
-
-
-def parse_trainer_config(trainer_conf, config_arg_str):
-    return config_parser.parse_config(trainer_conf, config_arg_str)
-
-
-def parse_network_config(network_conf, config_arg_str=''):
-    config = config_parser.parse_config(network_conf, config_arg_str)
-    return config.model_config
-
-
-def parse_optimizer_config(optimizer_conf, config_arg_str=''):
-    config_parser.settings = copy.deepcopy(config_parser.DEFAULT_SETTING)
-    optimizer_conf()
-    opt_config = OptimizationConfig()
-    for k, v in config_parser.settings.iteritems():
-        if v is None:
-            continue
-        opt_config.__setattr__(k, v)
-    return opt_config
-
-
-def reset_parser():
-    config_parser.begin_parse()
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
deleted file mode 100644
index a2a32d848cb..00000000000
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Data Sources are helpers to define paddle training data or testing data.
-"""
-from paddle.trainer.config_parser import *
-from .utils import deprecated
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-
-__all__ = ['define_py_data_sources2']
-
-
-def define_py_data_source(file_list,
-                          cls,
-                          module,
-                          obj,
-                          args=None,
-                          async=False,
-                          data_cls=PyData):
-    """
-    Define a python data source.
-
-    For example, the simplest usage in trainer_config.py as follow:
-
-    ..  code-block:: python
-
-        define_py_data_source("train.list", TrainData, "data_provider", "process")
-
-    Or. if you want to pass arguments from trainer_config to data_provider.py, then
-
-    ..  code-block:: python
-
-        define_py_data_source("train.list", TrainData, "data_provider", "process",
-                              args={"dictionary": dict_name})
-
-    :param data_cls:
-    :param file_list: file list name, which contains all data file paths
-    :type file_list: basestring
-    :param cls: Train or Test Class.
-    :type cls: TrainData or TestData
-    :param module: python module name.
-    :type module: basestring
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper.
-    :type obj: basestring
-    :param args: The best practice is using dict to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to
-                 receive arguments.
-    :type args: string or picklable object
-    :param async: Load Data asynchronously or not.
-    :type async: bool
-    :return: None
-    :rtype: None
-    """
-    if isinstance(file_list, list):
-        file_list_name = 'train.list'
-        if cls == TestData:
-            file_list_name = 'test.list'
-        with open(file_list_name, 'w') as f:
-            f.writelines(file_list)
-        file_list = file_list_name
-
-    if not isinstance(args, basestring) and args is not None:
-        args = pickle.dumps(args, 0)
-
-    cls(
-        data_cls(
-            files=file_list,
-            load_data_module=module,
-            load_data_object=obj,
-            load_data_args=args,
-            async_load_data=async))
-
-
-def define_py_data_sources(train_list,
-                           test_list,
-                           module,
-                           obj,
-                           args=None,
-                           train_async=False,
-                           data_cls=PyData):
-    """
-    The annotation is almost the same as define_py_data_sources2, except that
-    it can specific train_async and data_cls.
-
-    :param data_cls:
-    :param train_list: Train list name.
-    :type train_list: basestring
-    :param test_list: Test list name.
-    :type test_list: basestring
-    :param module: python module name. If train and test is different, then
-                   pass a tuple or list to this argument.
-    :type module: basestring or tuple or list
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper. If train and test is different, then pass
-                a tuple or list to this argument.
-    :type obj: basestring or tuple or list
-    :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive
-                 arguments. If train and test is different, then pass a tuple
-                 or list to this argument.
-    :type args: string or picklable object or list or tuple.
-    :param train_async: Is training data load asynchronously or not.
-    :type train_async: bool
-    :return: None
-    :rtype: None
-    """
-
-    def __is_splitable__(o):
-        return (isinstance(o, list) or
-                isinstance(o, tuple)) and hasattr(o, '__len__') and len(o) == 2
-
-    assert train_list is not None or test_list is not None
-    assert module is not None and obj is not None
-
-    test_module = module
-    train_module = module
-    if __is_splitable__(module):
-        train_module, test_module = module
-
-    test_obj = obj
-    train_obj = obj
-    if __is_splitable__(obj):
-        train_obj, test_obj = obj
-
-    if args is None:
-        args = ""
-
-    train_args = args
-    test_args = args
-    if __is_splitable__(args):
-        train_args, test_args = args
-
-    if train_list is not None:
-        define_py_data_source(train_list, TrainData, train_module, train_obj,
-                              train_args, train_async, data_cls)
-
-    if test_list is not None:
-        define_py_data_source(test_list, TestData, test_module, test_obj,
-                              test_args, False, data_cls)
-
-
-def define_py_data_sources2(train_list, test_list, module, obj, args=None):
-    """
-    Define python Train/Test data sources in one method. If train/test use
-    the same Data Provider configuration, module/obj/args contain one argument,
-    otherwise contain a list or tuple of arguments. For example\:
-
-    ..  code-block:: python
-
-        define_py_data_sources2(train_list="train.list",
-                                test_list="test.list",
-                                module="data_provider"
-                                # if train/test use different configurations,
-                                # obj=["process_train", "process_test"]
-                                obj="process",
-                                args={"dictionary": dict_name})
-
-    The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
-
-    :param train_list: Train list name.
-    :type train_list: basestring
-    :param test_list: Test list name.
-    :type test_list: basestring
-    :param module: python module name. If train and test is different, then
-                   pass a tuple or list to this argument.
-    :type module: basestring or tuple or list
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper. If train and test is different, then pass
-                a tuple or list to this argument.
-    :type obj: basestring or tuple or list
-    :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive
-                 arguments. If train and test is different, then pass a tuple
-                 or list to this argument.
-    :type args: string or picklable object or list or tuple.
-    :return: None
-    :rtype: None
-    """
-
-    def py_data2(files, load_data_module, load_data_object, load_data_args,
-                 **kwargs):
-        data = create_data_config_proto()
-        data.type = 'py2'
-        data.files = files
-        data.load_data_module = load_data_module
-        data.load_data_object = load_data_object
-        data.load_data_args = load_data_args
-        data.async_load_data = False
-        return data
-
-    define_py_data_sources(
-        train_list=train_list,
-        test_list=test_list,
-        module=module,
-        obj=obj,
-        args=args,
-        data_cls=py_data2)
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
deleted file mode 100644
index 69d860d9dab..00000000000
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import functools
-import inspect
-from .attrs import ParamAttr
-from .activations import TanhActivation
-from paddle.trainer.config_parser import *
-
-__all__ = [
-    'wrap_name_default', 'wrap_param_attr_default', 'wrap_bias_attr_default',
-    'wrap_act_default', 'wrap_param_default'
-]
-
-
-def __default_not_set_callback__(kwargs, name):
-    return name not in kwargs or kwargs[name] is None
-
-
-def wrap_param_default(param_names=None,
-                       default_factory=None,
-                       not_set_callback=__default_not_set_callback__):
-    assert param_names is not None
-    assert isinstance(param_names, list) or isinstance(param_names, tuple)
-    for each_param_name in param_names:
-        assert isinstance(each_param_name, basestring)
-
-    def __impl__(func):
-        @functools.wraps(func)
-        def __wrapper__(*args, **kwargs):
-            if len(args) != 0:
-                argspec = inspect.getargspec(func)
-                num_positional = len(argspec.args)
-                if argspec.defaults:
-                    num_positional -= len(argspec.defaults)
-                if not argspec.varargs and len(args) > num_positional:
-                    logger.fatal(
-                        "Must use keyword arguments for non-positional args")
-            for name in param_names:
-                if not_set_callback(kwargs, name):  # Not set
-                    kwargs[name] = default_factory(func)
-            return func(*args, **kwargs)
-
-        if hasattr(func, 'argspec'):
-            __wrapper__.argspec = func.argspec
-        else:
-            __wrapper__.argspec = inspect.getargspec(func)
-        return __wrapper__
-
-    return __impl__
-
-
-class DefaultNameFactory(object):
-    def __init__(self, name_prefix):
-        self.__counter__ = 0
-        self.__name_prefix__ = name_prefix
-
-    def __call__(self, func):
-        if self.__name_prefix__ is None:
-            self.__name_prefix__ = func.__name__
-        tmp = "__%s_%d__" % (self.__name_prefix__, self.__counter__)
-        self.__check_name__(tmp)
-        self.__counter__ += 1
-        return tmp
-
-    def __check_name__(self, nm):
-        """
-        @TODO(yuyang18): Implement it!
-        @param nm:
-        @return:
-        """
-        pass
-
-    def reset(self):
-        self.__counter__ = 0
-
-
-_name_factories = []
-
-
-def reset_hook():
-    for factory in _name_factories:
-        factory.reset()
-
-
-register_parse_config_hook(reset_hook)
-
-
-def wrap_name_default(name_prefix=None, name_param="name"):
-    """
-    Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}".
-
-    ..  code:: python
-
-        @wrap_name_default("some_name")
-        def func(name=None):
-            print name      # name will never be None. If name is not set,
-                            # name will be "some_name_%d"
-
-    :param name_prefix: name prefix. wrapped function's __name__ if None.
-    :type name_prefix: basestring
-    :return: a decorator to set default name
-    :rtype: callable
-    """
-    factory = DefaultNameFactory(name_prefix)
-    _name_factories.append(factory)
-    return wrap_param_default([name_param], factory)
-
-
-def wrap_param_attr_default(param_names=None, default_factory=None):
-    """
-    Setting Default Parameter Attributes Decorator.
-
-    :param default_factory:
-    :param param_names: Parameter Attribute's Names, list of string
-    :type param_names: list
-    :return: decorator
-    """
-    if param_names is None:
-        param_names = ['param_attr']
-    if default_factory is None:
-        default_factory = lambda _: ParamAttr()
-
-    return wrap_param_default(param_names, default_factory)
-
-
-def wrap_bias_attr_default(param_names=None,
-                           default_factory=None,
-                           has_bias=True):
-    if param_names is None:
-        param_names = ['bias_attr']
-    if default_factory is None:
-        default_factory = lambda _: ParamAttr(initial_std=0., initial_mean=0.)
-
-    def __bias_attr_not_set__(kwargs, name):
-        if has_bias:
-            return name not in kwargs or kwargs[name] is None or \
-                   kwargs[name] == True
-        else:
-            return name in kwargs and kwargs[name] == True
-
-    return wrap_param_default(param_names, default_factory,
-                              __bias_attr_not_set__)
-
-
-def wrap_act_default(param_names=None, act=None):
-    if param_names is None:
-        param_names = ["act"]
-
-    if act is None:
-        act = TanhActivation()
-
-    return wrap_param_default(param_names, lambda _: act)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
deleted file mode 100644
index 0eeaf7eabb1..00000000000
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ /dev/null
@@ -1,813 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import *
-from default_decorators import *
-
-__all__ = [
-    "evaluator_base",
-    "classification_error_evaluator",
-    "auc_evaluator",
-    "pnpair_evaluator",
-    "precision_recall_evaluator",
-    "ctc_error_evaluator",
-    "chunk_evaluator",
-    "sum_evaluator",
-    "column_sum_evaluator",
-    "value_printer_evaluator",
-    "gradient_printer_evaluator",
-    "maxid_printer_evaluator",
-    "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator",
-    "classification_error_printer_evaluator",
-    "detection_map_evaluator",
-]
-
-
-class EvaluatorAttribute(object):
-    FOR_CLASSIFICATION = 1
-    FOR_REGRESSION = 1 << 1
-    FOR_RANK = 1 << 2
-    FOR_PRINT = 1 << 3
-    FOR_UTILS = 1 << 4
-    FOR_DETECTION = 1 << 5
-
-    KEYS = [
-        "for_classification", "for_regression", "for_rank", "for_print",
-        "for_utils", "for_detection"
-    ]
-
-    @staticmethod
-    def to_key(idx):
-        tmp = 1
-        for i in xrange(0, len(EvaluatorAttribute.KEYS)):
-            if idx == tmp:
-                return EvaluatorAttribute.KEYS[i]
-            else:
-                tmp = (tmp << 1)
-
-
-def evaluator(*attrs):
-    def impl(method):
-        for attr in attrs:
-            setattr(method, EvaluatorAttribute.to_key(attr), True)
-        method.is_evaluator = True
-        return method
-
-    return impl
-
-
-def evaluator_base(input,
-                   type,
-                   label=None,
-                   weight=None,
-                   name=None,
-                   chunk_scheme=None,
-                   num_chunk_types=None,
-                   classification_threshold=None,
-                   positive_label=None,
-                   dict_file=None,
-                   result_file=None,
-                   num_results=None,
-                   delimited=None,
-                   top_k=None,
-                   excluded_chunk_types=None,
-                   overlap_threshold=None,
-                   background_id=None,
-                   evaluate_difficult=None,
-                   ap_type=None):
-    """
-    Evaluator will evaluate the network status while training/testing.
-
-    User can use evaluator by classify/regression job. For example.
-
-    ..  code-block:: python
-
-        classify(prediction, output, evaluator=classification_error_evaluator)
-
-    And user could define evaluator separately as follow.
-
-    ..  code-block:: python
-
-        classification_error_evaluator("ErrorRate", prediction, label)
-
-    The evaluator often contains a name parameter. It will also be printed when
-    evaluating network. The printed information may look like the following.
-
-    ..  code-block:: text
-
-         Batch=200 samples=20000 AvgCost=0.679655 CurrentCost=0.662179 Eval:
-         classification_error_evaluator=0.4486
-         CurrentEval: ErrorRate=0.3964
-
-    :param input: Input layers, a object of LayerOutput or a list of
-                  LayerOutput.
-    :type input: list|LayerOutput
-    :param label: An input layer containing the ground truth label.
-    :type label: LayerOutput|None
-    :param weight: An input layer which is a weight for each sample.
-                   Each evaluator may calculate differently to use this weight.
-    :type weight: LayerOutput.
-    :param top_k: number k in top-k error rate
-    :type top_k: int
-    :param overlap_threshold: In detection tasks to filter detection results
-    :type overlap_threshold: float
-    :param background_id: Identifier of background class
-    :type background_id: int
-    :param evaluate_difficult: Whether to evaluate difficult objects
-    :type evaluate_difficult: bool
-    :param ap_type: How to calculate average persicion
-    :type ap_type: str
-    """
-    # inputs type assertions.
-    assert classification_threshold is None or isinstance(
-        classification_threshold, float)
-    assert positive_label is None or isinstance(positive_label, int)
-    assert num_results is None or isinstance(num_results, int)
-    assert top_k is None or isinstance(top_k, int)
-
-    if not isinstance(input, list):
-        input = [input]
-
-    if label:
-        input.append(label)
-    if weight:
-        input.append(weight)
-
-    Evaluator(
-        name=name,
-        type=type,
-        inputs=[i.name for i in input],
-        chunk_scheme=chunk_scheme,
-        num_chunk_types=num_chunk_types,
-        classification_threshold=classification_threshold,
-        positive_label=positive_label,
-        dict_file=dict_file,
-        result_file=result_file,
-        delimited=delimited,
-        num_results=num_results,
-        top_k=top_k,
-        excluded_chunk_types=excluded_chunk_types,
-        overlap_threshold=overlap_threshold,
-        background_id=background_id,
-        evaluate_difficult=evaluate_difficult,
-        ap_type=ap_type)
-
-
-@evaluator(EvaluatorAttribute.FOR_DETECTION)
-@wrap_name_default()
-def detection_map_evaluator(input,
-                            label,
-                            overlap_threshold=0.5,
-                            background_id=0,
-                            evaluate_difficult=False,
-                            ap_type="11point",
-                            name=None):
-    """
-    Detection mAP Evaluator. It will print mean Average Precision (mAP) for detection.
-
-    The detection mAP Evaluator based on the output of detection_output layer counts
-    the true positive and the false positive bbox and integral them to get the
-    mAP.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval =  detection_map_evaluator(input=det_output,label=lbl)
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param label: Label layer.
-    :type label: LayerOutput
-    :param overlap_threshold: The bbox overlap threshold of a true positive.
-    :type overlap_threshold: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :param evaluate_difficult: Whether evaluate a difficult ground truth.
-    :type evaluate_difficult: bool
-    """
-    if not isinstance(input, list):
-        input = [input]
-
-    if label:
-        input.append(label)
-
-    evaluator_base(
-        name=name,
-        type="detection_map",
-        input=input,
-        label=label,
-        overlap_threshold=overlap_threshold,
-        background_id=background_id,
-        evaluate_difficult=evaluate_difficult,
-        ap_type=ap_type)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def classification_error_evaluator(input,
-                                   label,
-                                   name=None,
-                                   weight=None,
-                                   top_k=None,
-                                   threshold=None):
-    """
-    Classification Error Evaluator. It will print error rate for classification.
-
-    The classification error is:
-
-    ..  math::
-
-        classification\\_error = \\frac{NumOfWrongPredicts}{NumOfAllSamples}
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval =  classification_error_evaluator(input=prob,label=lbl)
-
-    :param name: Evaluator name.
-    :type name: basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: basestring
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. And will just multiply to NumOfWrongPredicts
-                  and NumOfAllSamples. So, the elements of weight are all one,
-                  then means not set weight. The larger weight it is, the more
-                  important this sample is.
-    :type weight: LayerOutput
-    :param top_k: number k in top-k error rate
-    :type top_k: int
-    :param threshold: The classification threshold.
-    :type threshold: float
-    :return: None.
-    """
-
-    evaluator_base(
-        name=name,
-        type="classification_error",
-        input=input,
-        label=label,
-        weight=weight,
-        top_k=top_k,
-        classification_threshold=threshold, )
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def auc_evaluator(
-        input,
-        label,
-        name=None,
-        weight=None, ):
-    """
-    Auc Evaluator which adapts to binary classification.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = auc_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: None|basestring
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1].
-    :type weight: LayerOutput
-    """
-    evaluator_base(
-        name=name,
-        type="last-column-auc",
-        input=input,
-        label=label,
-        weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_RANK)
-@wrap_name_default()
-def pnpair_evaluator(
-        input,
-        label,
-        query_id,
-        weight=None,
-        name=None, ):
-    """
-    Positive-negative pair rate Evaluator which adapts to rank task like
-    learning to rank. This evaluator must contain at least three layers.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = pnpair_evaluator(input, label, query_id)
-
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: LayerOutput
-    :param query_id: Query_id layer name. Query_id indicates that which query
-     each sample belongs to. Its shape should be
-     the same as output of Label layer.
-    :type query_id: LayerOutput
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1] which indicates the weight of each sample.
-                  The default weight of sample is 1 if the weight layer is None.
-                  And the pair weight is the mean of the two samples' weight.
-    :type weight: LayerOutput
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    if not isinstance(input, list):
-        input = [input]
-    if label:
-        input.append(label)
-    if query_id:
-        input.append(query_id)
-    evaluator_base(
-        input=input,
-        type="pnpair",
-        weight=weight,
-        name=name, )
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def precision_recall_evaluator(
-        input,
-        label,
-        positive_label=None,
-        weight=None,
-        name=None, ):
-    """
-    An Evaluator to calculate precision and recall, F1-score.
-    It is adapt to the task with multiple labels.
-
-    - If positive_label=-1, it will print the average precision, recall,
-      F1-score of all labels.
-
-    - If use specify positive_label, it will print the precision, recall,
-      F1-score of this label.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = precision_recall_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: LayerOutput
-    :param positive_label: The input label layer.
-    :type positive_label: LayerOutput.
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
-    :type weight: LayerOutput
-    """
-    evaluator_base(
-        name=name,
-        type="precision_recall",
-        input=input,
-        label=label,
-        positive_label=positive_label,
-        weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def ctc_error_evaluator(
-        input,
-        label,
-        name=None, ):
-    """
-    This evaluator is to calculate sequence-to-sequence edit distance.
-
-    The simple usage is :
-
-    .. code-block:: python
-
-       eval = ctc_error_evaluator(input=input, label=lbl)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer. Should be the same as the input for ctc_layer.
-    :type input: LayerOutput
-    :param label: input label, which is a data_layer. Should be the same as the
-                  label for ctc_layer
-    :type label: LayerOutput
-    """
-    evaluator_base(
-        name=name, type="ctc_edit_distance", input=input, label=label)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def chunk_evaluator(
-        input,
-        label,
-        chunk_scheme,
-        num_chunk_types,
-        name=None,
-        excluded_chunk_types=None, ):
-    """
-    Chunk evaluator is used to evaluate segment labelling accuracy for a
-    sequence. It calculates precision, recall and F1 scores for the chunk detection.
-
-    To use chunk evaluator, several concepts need to be clarified firstly.
-
-    * **Chunk type** is the type of the whole chunk and a chunk consists of one or several words.  (For example in NER, ORG for organization name, PER for person name etc.)
-
-    * **Tag type** indicates the position of a word in a chunk. (B for begin, I for inside, E for end, S for single)
-    We can name a label by combining tag type and chunk type. (ie. B-ORG for begining of an organization name)
-
-    The construction of label dictionary should obey the following rules:
-
-    - Use one of the listed labelling schemes. These schemes differ in ways indicating chunk boundry.
-
-    .. code-block:: text
-
-        Scheme    Description
-        plain    Use the same label for the whole chunk.
-        IOB      Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside.
-        IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
-        IOBES    Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk.
-
-    To make it clear, let's illustrate by an NER example.
-    Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
-    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
-    in which B-ORG for begining of ORG and I-ORG for inside of ORG.
-    Prefixes which are called 'tag type' here are added to chunk types and there are two tag types including B and I.
-    Of course, the training data should be labeled accordingly.
-
-    - Mapping is done correctly by the listed equations and assigning protocol.
-
-    The following table are equations to extract tag type and chunk type from a label.
-
-    .. code-block:: text
-
-        tagType = label % numTagType
-        chunkType = label / numTagType
-        otherChunkType = numChunkTypes
-
-    The following table shows the mapping rule between tagType and tag type in each scheme.
-
-    .. code-block:: text
-
-        Scheme Begin Inside End   Single
-        plain  0     -      -     -
-        IOB    0     1      -     -
-        IOE    -     0      1     -
-        IOBES  0     1      2     3
-
-    Continue the NER example, and the label dict should look like this to satify above equations:
-
-    .. code-block:: text
-
-        B-ORG  0
-        I-ORG  1
-        B-PER  2
-        I-PER  3
-        B-LOC  4
-        I-LOC  5
-        O      6
-
-    In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC, because the scheme is
-    "IOB" so tagType has two values: 0 for B and 1 for I.
-    Here we will use I-LOC to explain the above mapping rules in detail.
-    For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC
-    and the tag is I.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
-
-
-    :param input: The input layers.
-    :type input: LayerOutput
-    :param label: An input layer containing the ground truth label.
-    :type label: LayerOutput
-    :param chunk_scheme: The labelling schemes support 4 types. It is one of
-                         "IOB", "IOE", "IOBES", "plain". It is required.
-    :type chunk_scheme: basestring
-    :param num_chunk_types: number of chunk types other than "other"
-    :param name: The Evaluator name, it is optional.
-    :type name: basename|None
-    :param excluded_chunk_types: chunks of these types are not considered
-    :type excluded_chunk_types: list of integer|None
-    """
-    evaluator_base(
-        name=name,
-        type="chunk",
-        input=input,
-        label=label,
-        chunk_scheme=chunk_scheme,
-        num_chunk_types=num_chunk_types,
-        excluded_chunk_types=excluded_chunk_types, )
-
-
-@evaluator(EvaluatorAttribute.FOR_UTILS)
-@wrap_name_default()
-def sum_evaluator(
-        input,
-        name=None,
-        weight=None, ):
-    """
-    An Evaluator to sum the result of input.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = sum_evaluator(input)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name.
-    :type input: LayerOutput
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
-    :type weight: LayerOutput
-    """
-    evaluator_base(name=name, type="sum", input=input, weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_UTILS)
-@wrap_name_default()
-def column_sum_evaluator(
-        input,
-        name=None,
-        weight=None, ):
-    """
-    This Evaluator is used to sum the last column of input.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = column_sum_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name.
-    :type input: LayerOutput
-    """
-    evaluator_base(
-        name=name, type="last-column-sum", input=input, weight=weight)
-
-
-"""
-The following are printer Evaluators which are usually used to
-print the result, like value or gradient of input layers, the
-results generated in machine translation, the classification error etc.
-"""
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def value_printer_evaluator(
-        input,
-        name=None, ):
-    """
-    This Evaluator is used to print the values of input layers. It contains
-    one or more input layers.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = value_printer_evaluator(input)
-
-    :param input: One or more input layers.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(name=name, type="value_printer", input=input)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def gradient_printer_evaluator(
-        input,
-        name=None, ):
-    """
-    This Evaluator is used to print the gradient of input layers. It contains
-    one or more input layers.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = gradient_printer_evaluator(input)
-
-    :param input: One or more input layers.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(name=name, type="gradient_printer", input=input)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def maxid_printer_evaluator(
-        input,
-        num_results=None,
-        name=None, ):
-    """
-    This Evaluator is used to print maximum top k values and their indexes
-    of each row of input layers. It contains one or more input layers.
-    k is specified by num_results.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = maxid_printer_evaluator(input)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param num_results: This number is used to specify the top k numbers.
-                        It is 1 by default.
-    :type num_results: int.
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name, type="max_id_printer", input=input, num_results=num_results)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def maxframe_printer_evaluator(
-        input,
-        num_results=None,
-        name=None, ):
-    """
-    This Evaluator is used to print the top k frames of each input layers.
-    The input layers should contain sequences info or sequences type.
-    k is specified by num_results.
-    It contains one or more input layers.
-
-    Note:
-        The width of each frame is 1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = maxframe_printer_evaluator(input)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name,
-        type="max_frame_printer",
-        input=input,
-        num_results=num_results)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def seqtext_printer_evaluator(
-        input,
-        result_file,
-        id_input=None,
-        dict_file=None,
-        delimited=None,
-        name=None, ):
-    """
-    Sequence text printer will print text according to index matrix and a
-    dictionary. There can be multiple input to this layer:
-
-    1. If there is no id_input, the input must be a matrix containing
-    the sequence of indices;
-
-    2. If there is id_input, it should be ids, and interpreted as sample ids.
-
-    The output format will be:
-
-    1. sequence without sub-sequence, and there is probability.
-
-    .. code-block:: python
-
-         id \t prob space_seperated_tokens_from_dictionary_according_to_seq
-
-    2. sequence without sub-sequence, and there is not probability.
-
-    .. code-block:: python
-
-         id \t space_seperated_tokens_from_dictionary_according_to_seq
-
-    3. sequence with sub-sequence, and there is not probability.
-
-    .. code-block:: python
-
-         id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
-         \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
-         ...
-
-    Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
-    with maxid (when generating) as an input.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = seqtext_printer_evaluator(input=maxid_layer,
-                                        id_input=sample_id,
-                                        dict_file=dict_file,
-                                        result_file=result_file)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param result_file: Path of the file to store the generated results.
-    :type result_file: basestring
-    :param id_input: Index of the input sequence, and the specified index will
-                     be prited in the gereated results. This an optional
-                     parameter.
-    :type id_input: LayerOutput
-    :param dict_file: Path of dictionary. This is an optional parameter.
-                      Every line is a word in the dictionary with
-                      (line number - 1) as the word index.
-                      If this parameter is set to None, or to an empty string,
-                      only word index are printed in the generated results.
-    :type dict_file: basestring
-    :param delimited: Whether to use space to separate output tokens.
-                Default is True. No space is added if set to False.
-    :type delimited: bool
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :return: The seq_text_printer that prints the generated sequence to a file.
-    :rtype: evaluator
-    """
-    assert isinstance(result_file, basestring)
-    if id_input is None:
-        inputs = [input]
-    else:
-        inputs = [id_input, input]
-        input.parents.append(id_input)
-
-    evaluator_base(
-        name=name,
-        type="seq_text_printer",
-        input=inputs,
-        dict_file=dict_file,
-        result_file=result_file,
-        delimited=delimited)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def classification_error_printer_evaluator(
-        input,
-        label,
-        threshold=0.5,
-        name=None, ):
-    """
-    This Evaluator is used to print the classification error of each sample.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = classification_error_printer_evaluator(input)
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param label: Input label layer.
-    :type label: LayerOutput
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name,
-        type="classification_error_printer",
-        input=input,
-        label=label,
-        classification_threshold=threshold)
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
deleted file mode 100644
index ee84188bacc..00000000000
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .layers import LayerOutput, mixed_layer, identity_projection, \
-    slope_intercept_layer, scaling_layer, repeat_layer
-from .attrs import is_compatible_with
-from .default_decorators import *
-import activations as act
-from paddle.trainer.config_parser import logger
-
-__all__ = []
-
-
-def register_unary_math_op(op_name, act):
-    def op(input, name=None):
-        return mixed_layer(
-            input=[identity_projection(input=input)], name=name, act=act)
-
-    op = wrap_name_default(op_name)(op)
-    op.__doc__ = type(act).__doc__
-    globals()[op_name] = op
-    __all__.append(op_name)
-
-
-register_unary_math_op('exp', act.ExpActivation())
-register_unary_math_op('log', act.LogActivation())
-register_unary_math_op('abs', act.AbsActivation())
-register_unary_math_op('sigmoid', act.SigmoidActivation())
-register_unary_math_op('tanh', act.TanhActivation())
-register_unary_math_op('square', act.SquareActivation())
-register_unary_math_op('relu', act.ReluActivation())
-register_unary_math_op('sqrt', act.SqrtActivation())
-register_unary_math_op('reciprocal', act.ReciprocalActivation())
-
-
-def add(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, intercept=other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be added with"
-                     " another LayerOutput or a number")
-    if layeroutput.size == other.size:
-        return mixed_layer(input=[
-            identity_projection(input=layeroutput),
-            identity_projection(input=other)
-        ])
-    if other.size != 1 and layeroutput.size != 1:
-        logger.fatal("Two LayerOutput can be added only if they have equal size"
-                     " or one of their sizes is 1. sizes are %s and %s" %
-                     (layeroutput.size, other.size))
-    elif layeroutput.size == 1:
-        tmp = layeroutput
-        layeroutput = other
-        other = tmp
-    other = repeat_layer(other, layeroutput.size)
-    return mixed_layer(input=[
-        identity_projection(input=layeroutput), identity_projection(input=other)
-    ])
-
-
-LayerOutput.__radd__ = add
-LayerOutput.__add__ = add
-
-
-def sub(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, intercept=-other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be subtracted with"
-                     " another Layeroutput or a number")
-    neg = slope_intercept_layer(input=other, slope=-1.0)
-    return add(layeroutput, neg)
-
-
-LayerOutput.__sub__ = sub
-
-
-def rsub(layeroutput, other):
-    neg = slope_intercept_layer(input=layeroutput, slope=-1.0)
-    return add(neg, other)
-
-
-LayerOutput.__rsub__ = rsub
-
-
-def mul(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, slope=other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be multiplied with"
-                     " another Layeroutput or a number")
-    elif layeroutput.size == 1:
-        return scaling_layer(input=other, weight=layeroutput)
-    elif other.size == 1:
-        return scaling_layer(input=layeroutput, weight=other)
-    else:
-        logger.fatal("At least one of the operand of '*' must be a number"
-                     " or a LayerOutput with size=1")
-
-
-LayerOutput.__mul__ = mul
-LayerOutput.__rmul__ = mul
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
deleted file mode 100644
index ee34c157334..00000000000
--- a/python/paddle/trainer_config_helpers/layers.py
+++ /dev/null
@@ -1,7610 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import functools
-import collections
-import inspect
-
-import paddle.trainer.config_parser as cp
-from paddle.trainer.config_parser import *
-from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
-    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
-from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
-    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
-from .attrs import *
-from .default_decorators import *
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-import copy
-
-__all__ = [
-    'full_matrix_projection',
-    'AggregateLevel',
-    'ExpandLevel',
-    'identity_projection',
-    'dotmul_projection',
-    'dotmul_operator',
-    'repeat_layer',
-    'seq_reshape_layer',
-    'table_projection',
-    'mixed_layer',
-    'data_layer',
-    'embedding_layer',
-    'fc_layer',
-    'grumemory',
-    'pooling_layer',
-    'lstmemory',
-    'last_seq',
-    'first_seq',
-    'cos_sim',
-    'l2_distance_layer',
-    'hsigmoid',
-    'conv_projection',
-    'square_error_cost',
-    'regression_cost',
-    'classification_cost',
-    'LayerOutput',
-    'img_conv_layer',
-    'img_pool_layer',
-    'batch_norm_layer',
-    'img_cmrnorm_layer',
-    'addto_layer',
-    'concat_layer',
-    'seq_concat_layer',
-    'lstm_step_layer',
-    'recurrent_group',
-    'memory',
-    'StaticInput',
-    'expand_layer',
-    'scaling_layer',
-    'scaling_projection',
-    'power_layer',
-    'interpolation_layer',
-    'bilinear_interp_layer',
-    'trans_layer',
-    'rotate_layer',
-    'sum_to_one_norm_layer',
-    'row_l2_norm_layer',
-    'get_output_layer',
-    'LayerType',
-    'context_projection',
-    'beam_search',
-    'maxid_layer',
-    'GeneratedInput',
-    'SubsequenceInput',
-    'gru_step_layer',
-    'gru_step_naive_layer',
-    'recurrent_layer',
-    'BaseGeneratedInput',
-    'conv_operator',
-    'conv_shift_layer',
-    'tensor_layer',
-    'selective_fc_layer',
-    'sampling_id_layer',
-    'slope_intercept_layer',
-    'trans_full_matrix_projection',
-    'linear_comb_layer',
-    'convex_comb_layer',
-    'ctc_layer',
-    'warp_ctc_layer',
-    'crf_layer',
-    'crf_decoding_layer',
-    'nce_layer',
-    'cross_entropy_with_selfnorm',
-    'cross_entropy',
-    'BeamInput',
-    'cross_entropy_over_beam',
-    'multi_binary_label_cross_entropy',
-    'sum_cost',
-    'rank_cost',
-    'lambda_cost',
-    'huber_regression_cost',
-    'huber_classification_cost',
-    'block_expand_layer',
-    'maxout_layer',
-    'dot_prod_layer',
-    'out_prod_layer',
-    'printer_layer',
-    'print_layer',
-    'priorbox_layer',
-    'cross_channel_norm_layer',
-    'multibox_loss_layer',
-    'detection_output_layer',
-    'roi_pool_layer',
-    'spp_layer',
-    'pad_layer',
-    'eos_layer',
-    'smooth_l1_cost',
-    'layer_support',
-    'multiplex_layer',
-    'row_conv_layer',
-    'dropout_layer',
-    'prelu_layer',
-    'switch_order_layer',
-    'gated_unit_layer',
-    'crop_layer',
-    'sub_nested_seq_layer',
-    'clip_layer',
-    'slice_projection',
-    'seq_slice_layer',
-    'kmax_seq_score_layer',
-    'img_pool3d_layer',
-    'scale_shift_layer',
-    'img_conv3d_layer',
-    'resize_layer',
-    'sub_seq_layer',
-    'scale_sub_region_layer',
-    'upsample_layer',
-    'factorization_machine',
-]
-
-
-class LayerType(object):
-    """
-    Layer type enumerations.
-    """
-
-    DATA = 'data'
-    MIXED_LAYER = 'mixed'
-    LSTMEMORY = 'lstmemory'
-    GRUMEMORY = 'gated_recurrent'
-    SEQUENCE_LAST_INSTANCE = 'seqlastins'
-    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
-    SEQUENCE_RESHAPE = 'seqreshape'
-    POOLING_MAX = 'max'
-    POOLING_AVG = 'average'
-    UPSAMPLE_LAYER = 'upsample'
-    FC_LAYER = 'fc'
-    COST = 'cost'
-    COSINE_SIM_VEC = 'cos_vm'
-    COSINE_SIM = 'cos'
-    L2_DISTANCE = 'l2_distance'
-    HSIGMOID = 'hsigmoid'
-    CONV_LAYER = 'conv'
-    CONVTRANS_LAYER = 'convt'
-    EXCONV_LAYER = 'exconv'
-    EXCONVTRANS_LAYER = 'exconvt'
-    CUDNNCONV_LAYER = 'cudnn_conv'
-    CUDNNCONVTRANS_LAYER = 'cudnn_convt'
-    POOL_LAYER = 'pool'
-    POOL3D_LAYER = 'pool3d'
-    BATCH_NORM_LAYER = 'batch_norm'
-    NORM_LAYER = 'norm'
-    SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
-    ROW_L2_NORM_LAYER = 'row_l2_norm'
-    ADDTO_LAYER = 'addto'
-
-    CONCAT_LAYER = 'concat'
-    CONCAT_PROJ_LAYER = 'concat2'
-    SEQUENCE_CONCAT_LAYER = 'seqconcat'
-
-    LSTM_STEP_LAYER = 'lstm_step'
-    GRU_STEP_LAYER = 'gru_step'
-    GET_OUTPUT_LAYER = 'get_output'
-
-    EXPAND_LAYER = 'expand'
-    INTERPOLATION_LAYER = 'interpolation'
-    BILINEAR_INTERP_LAYER = 'bilinear_interp'
-    POWER_LAYER = 'power'
-    SCALING_LAYER = 'scaling'
-    TRANS_LAYER = 'trans'
-    ROTATE_LAYER = 'rotate'
-    DOT_PROD_LAYER = 'dot_prod'
-    OUT_PROD_LAYER = 'out_prod'
-    FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
-
-    MEMORY = 'memory'
-    MAXID_LAYER = 'maxid'
-    EOSID_LAYER = 'eos_id'
-    RECURRENT_LAYER = 'recurrent'
-
-    CONV_SHIFT_LAYER = "conv_shift"
-    TENSOR_LAYER = "tensor"
-    SEL_FC_LAYER = "selective_fc"
-    SAMPLING_ID_LAYER = "sampling_id"
-    SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    LINEAR_COMBINATION_LAYER = "convex_comb"
-    BLOCK_EXPAND = "blockexpand"
-    MAXOUT = "maxout"
-    SPP_LAYER = "spp"
-    PAD_LAYER = "pad"
-    MULTIPLEX_LAYER = "multiplex"
-    ROW_CONV_LAYER = "row_conv"
-
-    PRINT_LAYER = 'print'
-    PRIORBOX_LAYER = 'priorbox'
-    MULTIBOX_LOSS_LAYER = 'multibox_loss'
-    DETECTION_OUTPUT_LAYER = 'detection_output'
-    ROI_POOL_LAYER = 'roi_pool'
-
-    CTC_LAYER = 'ctc'
-    WARP_CTC_LAYER = 'warp_ctc'
-    CRF_LAYER = 'crf'
-    CRF_DECODING_LAYER = 'crf_decoding'
-    NCE_LAYER = 'nce'
-
-    CONV3D_LAYER = 'conv3d'
-    DECONV3D_LAYER = 'deconv3d'
-
-    RANK_COST = 'rank-cost'
-    LAMBDA_COST = 'lambda_cost'
-    HUBER_REGRESSION = 'huber_regression'
-    HUBER_CLASSIFICATION = 'huber_classification'
-    CROSS_ENTROPY = 'multi-class-cross-entropy'
-    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
-    CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
-    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
-    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
-    SUM_COST = 'sum_cost'
-    SMOOTH_L1 = 'smooth_l1'
-
-    PRELU = 'prelu'
-    SWITCH_ORDER_LAYER = 'switch_order'
-    CROP_LAYER = 'crop'
-    SUB_NESTED_SEQ = 'sub_nested_seq'
-    CLIP_LAYER = 'clip'
-    SEQ_SLICE = 'seq_slice'
-
-    KMAX_SEQ_SCORE = 'kmax_seq_score'
-    SCALE_SHIFT_LAYER = 'scale_shift'
-
-    RESIZE = 'resize'
-    SUB_SEQ_LAYER = 'subseq'
-
-    SCALE_SUB_REGION_LAYER = 'scale_sub_region'
-
-    FACTORIZATION_MACHINE = 'factorization_machine'
-
-    @staticmethod
-    def is_layer_type(type_name):
-        """
-        Whether type_name is a layer type.
-
-        :param type_name: layer type name. Because layer type enumerations are
-                          strings.
-        :type type_name: basestring
-        :return: True if is a layer_type
-        :rtype: bool
-        """
-        for key in dir(LayerType):
-            if key.isupper():
-                att = getattr(LayerType, key)
-                if isinstance(att, basestring) and type_name == att:
-                    return True
-        return False
-
-
-class AggregateLevel(object):
-    """
-    PaddlePaddle supports three sequence types:
-
-    - :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
-    - :code:`SequenceType.SEQUENCE` means the sample is a sequence.
-    - :code:`SequenceType.SUB_SEQUENCE` means the sample is a nested sequence,
-      each timestep of which is also a sequence.
-
-    Accordingly, AggregateLevel supports two modes:
-
-    - :code:`AggregateLevel.TO_NO_SEQUENCE` means the aggregation acts on each
-      timestep of a sequence, both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will
-      be aggregated to :code:`NO_SEQUENCE`.
-
-    - :code:`AggregateLevel.TO_SEQUENCE` means the aggregation acts on each
-      sequence of a nested sequence, :code:`SUB_SEQUENCE` will be aggregated to
-      :code:`SEQUENCE`.
-    """
-    TO_NO_SEQUENCE = 'non-seq'
-    TO_SEQUENCE = 'seq'
-    # compatible with previous configuration
-    EACH_TIMESTEP = TO_NO_SEQUENCE
-    EACH_SEQUENCE = TO_SEQUENCE
-
-
-class LayerOutput(object):
-    """
-    LayerOutput is output for layer function. It is used internally by several
-    reasons.
-
-    - Check layer connection make sense.
-
-        - FC(Softmax) => Cost(MSE Error) is not good for example.
-
-    - Tracking layer connection.
-
-    - Pass to layer methods as input.
-
-    :param name: Layer output name.
-    :type name: basestring
-    :param layer_type: Current Layer Type. One of LayerType enumeration.
-    :type layer_type: basestring
-    :param activation: Layer Activation.
-    :type activation: BaseActivation.
-    :param parents: Layer's parents.
-    :type parents: list | tuple | collections.Sequence
-    """
-
-    def __init__(self,
-                 name,
-                 layer_type,
-                 parents=None,
-                 activation=None,
-                 num_filters=None,
-                 img_norm_type=None,
-                 size=None,
-                 outputs=None,
-                 reverse=None):
-        assert isinstance(name, basestring)
-        assert isinstance(layer_type, basestring)
-        assert size is not None
-        assert LayerType.is_layer_type(layer_type)
-        self.name = name
-        self.full_name = MakeLayerNameInSubmodel(name)
-        self.layer_type = layer_type
-        if parents is not None and type(parents) != list:
-            parents = [parents]
-        self.parents = [] if parents is None else parents
-        self.activation = activation
-        self.num_filters = num_filters
-        self.img_norm_type = img_norm_type
-        self.size = size
-        if outputs is None:
-            outputs = ['default']
-        self.outputs = outputs
-        self.reverse = reverse
-
-    @property
-    def width(self):
-        return cp.g_layer_map[self.full_name].width
-
-    @property
-    def height(self):
-        return cp.g_layer_map[self.full_name].height
-
-    @property
-    def depth(self):
-        return cp.g_layer_map[self.full_name].depth
-
-    def set_input(self, input):
-        """
-        Set the input for a memory layer. Can only be used for memory layer
-        """
-        assert isinstance(input, LayerOutput)
-        assert self.layer_type == LayerType.MEMORY
-        SetMemoryInput(self.name, input.name)
-
-
-ERROR_CLIPPING = 'error_clipping_threshold'
-DROPOUT = 'drop_rate'
-DEVICE = 'device'
-
-
-def layer_support(*attrs):
-    attrs_list = list(attrs)
-    attrs_list.append(DEVICE)
-
-    def decorator(method):
-        @functools.wraps(method)
-        def wrapper(*args, **kwargs):
-            for attr in attrs_list:
-                for each in args:
-                    if isinstance(each, ExtraLayerAttribute):
-                        setattr(each, '_'.join(['can', attr]), True)
-                for key in kwargs:
-                    val = kwargs[key]
-                    if isinstance(val, ExtraLayerAttribute):
-                        setattr(val, '_'.join(['can', attr]), True)
-            for each in args:
-                if isinstance(each, ExtraLayerAttribute):
-                    each.check(method.__name__)
-            for key in kwargs:
-                val = kwargs[key]
-                if isinstance(val, ExtraLayerAttribute):
-                    val.check(method.__name__)
-            return method(*args, **kwargs)
-
-        if hasattr(method, 'argspec'):
-            wrapper.argspec = method.argspec
-        else:
-            wrapper.argspec = inspect.getargspec(method)
-
-        return wrapper
-
-    return decorator
-
-
-@wrap_param_attr_default()
-def full_matrix_projection(input, size=0, param_attr=None):
-    """
-    Full Matrix Projection. It performs full matrix multiplication.
-
-    ..  math::
-        out.row[i] += in.row[i] * weight
-
-    There are two styles of usage.
-
-    1. When used in mixed_layer like this, you can only set the input:
-
-    .. code-block:: python
-
-       with mixed_layer(size=100) as m:
-           m += full_matrix_projection(input=layer)
-
-    2. When used as an independent object like this, you must set the size:
-
-    .. code-block:: python
-
-       proj = full_matrix_projection(input=layer,
-                                     size=100,
-                                     param_attr=ParamAttr(name='_proj'))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: FullMatrixProjection Object.
-    :rtype: FullMatrixProjection
-    """
-    proj = FullMatrixProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def trans_full_matrix_projection(input, size=0, param_attr=None):
-    """
-    Different from full_matrix_projection, this projection performs matrix
-    multiplication, using the transpose of weight.
-
-    ..  math::
-        out.row[i] += in.row[i] * w^\mathrm{T}
-
-    :math:`w^\mathrm{T}` means the transpose of weight.
-    The simply usage is:
-
-    .. code-block:: python
-
-       proj = trans_full_matrix_projection(input=layer,
-                                           size=100,
-                                           param_attr=ParamAttr(
-                                                name='_proj',
-                                                initial_mean=0.0,
-                                                initial_std=0.01))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: TransposedFullMatrixProjection Object.
-    :rtype: TransposedFullMatrixProjection
-    """
-    proj = TransposedFullMatrixProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def table_projection(input, size=0, param_attr=None):
-    """
-    Table Projection. It selects rows from parameter where row\_id
-    is in input\_ids.
-
-    .. math::
-       out.row[i] += table.row[ids[i]]
-
-    where :math:`out` is output, :math:`table` is parameter, :math:`ids` is input\_ids,
-    and :math:`i` is row\_id.
-
-    There are two styles of usage.
-
-    1. When used in mixed_layer like this, you can only set the input:
-
-    .. code-block:: python
-
-       with mixed_layer(size=100) as m:
-           m += table_projection(input=layer)
-
-    2. When used as an independent object like this, you must set the size:
-
-    .. code-block:: python
-
-       proj = table_projection(input=layer,
-                               size=100,
-                               param_attr=ParamAttr(name='_proj'))
-
-
-    :param input: The input of this layer, which must contains id fields.
-    :type input: LayerOutput
-    :param size: The dimension of the output.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: TableProjection Object.
-    :rtype: TableProjection
-    """
-    proj = TableProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-def identity_projection(input, offset=None, size=None):
-    """
-    1. If offset=None, it performs IdentityProjection as follows:
-
-    .. math::
-       out.row[i] += in.row[i]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = identity_projection(input=layer)
-
-
-    2. If offset!=None, It executes IdentityOffsetProjection and takes the
-       elements of the input in the range [offset, offset+size) as output.
-
-    .. math::
-       out.row[i] += in.row[i + \\textrm{offset}]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = identity_projection(input=layer,
-                                  offset=10)
-
-    Note that neither of the projections have trainable parameter.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param offset: The offset from the start of the input. The input's
-                   elements in the range [offset, offset+size) will be
-                   taken as output. If this parameter is not set or set
-                   to None, the output will be the same as the input.
-    :type offset: int
-    :param size: The dimension of this layer. It will be neglected
-                 when offset is None or not set.
-    :type size: int
-    :return: IdentityProjection or IdentityOffsetProjection object
-    :rtype: IdentityProjection | IdentityOffsetProjection
-    """
-    if offset is None:
-        proj = IdentityProjection(input_layer_name=input.name)
-        proj.origin = input
-    else:
-        if size is None:
-            size = input.size - offset
-        proj = IdentityOffsetProjection(
-            input_layer_name=input.name, offset=offset, size=size)
-        proj.origin = input
-    return proj
-
-
-def slice_projection(input, slices):
-    """
-    slice_projection slices the input value into multiple parts,
-    then selects and merges some of them into a new output.
-
-    .. math::
-       output = [input.slices()]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
-
-    Note that slice_projection has no trainable parameter.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param slices: A list of start and end offsets of each slice.
-    :type slices: list of tuple
-    :return: SliceProjection object.
-    :rtype: SliceProjection
-    """
-    assert len(slices) >= 1
-    start = 0
-    for i in xrange(len(slices)):
-        assert len(slices[i]) == 2
-        # The start position of the next slice needs to be greater than
-        # or equal to the end position of the previous slice.
-        assert slices[i][0] >= start
-        assert slices[i][1] >= slices[i][0]
-        start = slices[i][1]
-    proj = SliceProjection(input_layer_name=input.name, slices=slices)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def scaling_projection(input, param_attr=None):
-    """
-    scaling_projection multiplies the input with a scalar parameter.
-
-    .. math::
-       out += w * in
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = scaling_projection(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: ScalingProjection object.
-    :rtype: ScalingProjection
-    """
-    proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def dotmul_projection(input, param_attr=None):
-    """
-    DotMulProjection takes a layer as input and performs
-    element-wise multiplication with weight.
-
-    ..  math::
-        out.row[i] += in.row[i] .* weight
-
-    where :math:`.*` means element-wise multiplication.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = dotmul_projection(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: DotMulProjection object.
-    :rtype: DotMulProjection
-    """
-    proj = DotMulProjection(
-        input_layer_name=input.name, size=input.size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-def dotmul_operator(a=None, b=None, scale=1, **kwargs):
-    """
-    DotMulOperator takes two inputs and performs element-wise multiplication:
-
-    .. math::
-       out.row[i] += scale * (a.row[i] .* b.row[i])
-
-    where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is 1.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
-
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param scale: A scalar to scale the product. Its default value is 1.
-    :type scale: float
-    :return: DotMulOperator object.
-    :rtype: DotMulOperator
-    """
-    if 'x' in kwargs or 'y' in kwargs:
-        logger.warning('x and y arguments for dotmul_operator is deprecated. '
-                       'Please use a and b as parameter.')
-    a = kwargs.get('x', a)  # For Backward capacity.
-    b = kwargs.get('y', b)
-    assert isinstance(a, LayerOutput)
-    assert isinstance(b, LayerOutput)
-    if a.size is not None and b.size is not None:
-        assert a.size == b.size
-
-    op = DotMulOperator(input_layer_names=[a.name, b.name], scale=scale)
-    op.origin = [a, b]
-    return op
-
-
-@wrap_bias_attr_default(['padding_attr'])
-def context_projection(input,
-                       context_len,
-                       context_start=None,
-                       padding_attr=False):
-    """
-    Context Projection.
-
-    It just reorganizes input sequence, combines "context_len" elements of the
-    sequence to one context from context_start. "context_start" will be set to
-    -(context_len - 1) / 2 by default. When context position is out of sequence
-    length, padding will be filled as zero if padding_attr = False, otherwise
-    it is trainable.
-
-    For example, origin sequence is [A B C D E F G], context len is 3, padding_attr
-    is not set, then after context projection, sequence will
-    be [ 0AB ABC BCD CDE DEF EFG FG0 ].
-
-    :param input: The input of this layer, which should be a sequence.
-    :type input: LayerOutput
-    :param context_len: The length of the context.
-    :type context_len: int
-    :param context_start: The start position of the context. The default value is
-                          -(context_len - 1)/2
-    :type context_start: int
-    :param padding_attr: Parameter attribute of the padding. If the parameter is
-                         set to False, padding will be zero. In other cases, the
-                         padding is trainable, and its parameter attribute is set
-                         by this parameter.
-    :type padding_attr: bool | ParameterAttribute
-    :return: Projection object.
-    :rtype: Projection
-    """
-    context_start = -(
-        context_len - 1) / 2 if context_start is None else context_start
-
-    extra_dict = dict()
-    trainable = isinstance(padding_attr, ParameterAttribute)
-    if trainable:
-        extra_dict = padding_attr.attr
-
-    proj = ContextProjection(
-        input_layer_name=input.name,
-        context_length=context_len,
-        context_start=context_start,
-        trainable_padding=trainable,
-        **extra_dict)
-    proj.origin = input
-    return proj
-
-
-class MixedLayerType(LayerOutput):
-    """
-    The internal object for trainer_helpers.
-    """
-
-    class AddToSealedMixedLayerException(Exception):
-        def __init__(self):
-            Exception.__init__(self)
-
-    def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
-        """
-        :param name: The name of this layer.
-        :type name: basestring
-        :param size: The dimension of this layer.
-        :type size: int
-        :param act: Activation type.
-        :type act: BaseActivation
-        :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                          whose type is not ParameterAttribute, no bias is defined. If the
-                          parameter is set to True, the bias is initialized to zero.
-        :type bias_attr: ParameterAttribute | None | bool | Any
-        :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                           details.
-        :type layer_attr: ExtraLayerAttribute | None
-        """
-        LayerOutput.__init__(
-            self,
-            name,
-            LayerType.MIXED_LAYER,
-            parents,
-            size=size,
-            activation=act)
-        self.bias_attr = bias_attr
-        self.layer_attr = layer_attr
-        self.inputs = []
-        self.finalized = False
-
-    def __iadd__(self, other):
-        """
-        + += operator
-        :param other: Other projection.
-        :type other: Projection
-        :return: self.
-        :rtype: MixedLayerType
-        """
-        if not self.finalized:
-            assert isinstance(other, Projection) or isinstance(other, Operator)
-            self.inputs.append(other)
-            if isinstance(other, Projection):
-                self.parents.append(other.origin)
-            else:
-                self.parents.extend(other.origin)
-            return self
-        else:
-            raise MixedLayerType.AddToSealedMixedLayerException()
-
-    def __enter__(self):
-        assert len(self.inputs) == 0
-        return self
-
-    def __exit__(self, exc_type, exc_value, tb):
-        if exc_value is not None:
-            raise exc_value
-        assert len(self.inputs) != 0
-        ml = MixedLayer(
-            name=self.name,
-            size=self.size,
-            active_type=self.activation.name,
-            bias=ParamAttr.to_bias(self.bias_attr),
-            inputs=self.inputs,
-            **ExtraLayerAttribute.to_kwargs(self.layer_attr))
-        # update the size which might be computed inside MixedLayer
-        # according to the operator's output size
-        self.size = ml.config.size
-        self.finalized = True
-
-
-@wrap_name_default("mixed")
-@wrap_act_default(act=LinearActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def mixed_layer(size=0,
-                input=None,
-                name=None,
-                act=None,
-                bias_attr=False,
-                layer_attr=None):
-    """
-    Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
-    Each input is a projection or operator.
-
-    There are two styles of usages.
-
-    1. When the parameter input is not set, use mixed_layer like this:
-
-    .. code-block:: python
-
-       with mixed_layer(size=256) as m:
-           m += full_matrix_projection(input=layer1)
-           m += identity_projection(input=layer2)
-
-    2. You can also set all inputs when invoke mixed_layer as follows:
-
-    .. code-block:: python
-
-       m = mixed_layer(size=256,
-                       input=[full_matrix_projection(input=layer1),
-                              full_matrix_projection(input=layer2)])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The dimension of this layer.
-    :type size: int
-    :param input: The input of this layer. It is an optional parameter.
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: MixedLayerType object.
-    :rtype: MixedLayerType
-    """
-
-    if input is None:
-        return MixedLayerType(name, size, act, bias_attr, layer_attr)
-    else:
-        with mixed_layer(
-                name=name,
-                size=size,
-                act=act,
-                bias_attr=bias_attr,
-                layer_attr=layer_attr) as m:
-            if isinstance(input, collections.Sequence):
-                for each in input:
-                    m += each
-            else:
-                m += input
-        return m
-
-
-@layer_support()
-def data_layer(name, size, depth=None, height=None, width=None,
-               layer_attr=None):
-    """
-    Define DataLayer For NeuralNetwork.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        data = data_layer(name="input", size=1000)
-
-    :param name: The name of this layer.
-    :type name: basestring
-    :param size: The dimension of this data layer.
-    :type size: int
-    :param height: The height of the input image data.
-    :type height: int | None
-    :param width: The width of the input image data.
-    :type width: int | None
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        type=LayerType.DATA,
-        name=name,
-        size=size,
-        depth=depth,
-        height=height,
-        width=width,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    if depth is None:
-        depth = 1
-    num_filters = None
-    if height is not None and width is not None:
-        num_filters = size / (width * height * depth)
-        assert num_filters * width * height * depth == size, \
-                "size=%s width=%s height=%s depth=%s" % (size, width, height, depth)
-
-    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
-
-
-@wrap_name_default("embedding")
-@wrap_param_attr_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
-    """
-    Define a embedding Layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, whose type must be Index Data.
-    :type input: LayerOutput
-    :param size: The dimension of the embedding vector.
-    :type size: int
-    :param param_attr: The embedding parameter attribute. See ParameterAttribute
-                      for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name=name,
-            size=size,
-            act=LinearActivation(),
-            bias_attr=False,
-            layer_attr=layer_attr) as mix:
-        mix += table_projection(input=input, size=size, param_attr=param_attr)
-    return mix
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def fc_layer(input,
-             size,
-             act=None,
-             name=None,
-             param_attr=None,
-             bias_attr=None,
-             layer_attr=None):
-    """
-    The fully connected layer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       fc = fc_layer(input=layer,
-                     size=1024,
-                     act=LinearActivation(),
-                     bias_attr=False)
-
-    which is equal to:
-
-    .. code-block:: python
-
-       with mixed_layer(size=1024) as fc:
-           fc += full_matrix_projection(input=layer)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param size: The dimension of this layer.
-    :type size: int
-    :param act: Activation Type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            if "parameter_name" in param_attr.attr and len(input) > 1:
-                logger.fatal(
-                    "When the name field of param_attr is manually specified "
-                    "and the input is a list, the param_attr should also be a "
-                    "list with each item being the param_attr for each input "
-                    "item. If only one named param_attr is provided, all the "
-                    "input items would share this parameter.")
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-
-    Layer(
-        inputs=[
-            Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
-        ],
-        name=name,
-        type=LayerType.FC_LAYER,
-        size=size,
-        bias=ParamAttr.to_bias(bias_attr),
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.FC_LAYER, input, activation=act, size=size)
-
-
-@wrap_name_default("print")
-def printer_layer(input, format=None, name=None):
-    """
-    Print the output value of the layers specified by the parameter input.
-    This layer is useful for debugging.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    assert isinstance(input, collections.Sequence)  # list or tuple
-    for each in input:
-        assert isinstance(each, LayerOutput)
-
-    Layer(
-        name=name,
-        format=format,
-        type=LayerType.PRINT_LAYER,
-        inputs=[l.name for l in input], )
-    # this layer don't return anything, can not be input of other layer.
-
-# Keep print_layer for compatibility with V1 API.
-# 'print_layer' does not work for V2 API because it will be changed to
-# 'print' for V2 API. But 'print' is a reserved key word in python.
-
-
-print_layer = printer_layer
-
-
-@wrap_name_default("priorbox")
-def priorbox_layer(input,
-                   image,
-                   aspect_ratio,
-                   variance,
-                   min_size,
-                   max_size=[],
-                   name=None):
-    """
-    Compute the priorbox and set the variance. This layer is necessary for ssd.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param image: The network input image.
-    :type image: LayerOutput
-    :param aspect_ratio: The aspect ratio.
-    :type aspect_ratio: list
-    :param variance: The bounding box variance.
-    :type min_size: The minimum size of the priorbox width/height.
-    :param min_size: list
-    :type max_size: The maximum size of the priorbox width/height. It could be NULL.
-    :param max_size: list
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    # plus one for ratio 1.
-    num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
-    size = (input.size / input.num_filters) * num_filters * 2
-    Layer(
-        name=name,
-        type=LayerType.PRIORBOX_LAYER,
-        inputs=[input.name, image.name],
-        size=size,
-        min_size=min_size,
-        max_size=max_size,
-        aspect_ratio=aspect_ratio,
-        variance=variance)
-    return LayerOutput(
-        name,
-        LayerType.PRIORBOX_LAYER,
-        parents=[input, image],
-        num_filters=num_filters,
-        size=size)
-
-
-@wrap_name_default("multibox_loss")
-def multibox_loss_layer(input_loc,
-                        input_conf,
-                        priorbox,
-                        label,
-                        num_classes,
-                        overlap_threshold=0.5,
-                        neg_pos_ratio=3.0,
-                        neg_overlap=0.5,
-                        background_id=0,
-                        name=None):
-    """
-    Compute the location loss and the confidence loss for ssd.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input_loc: The input predicted locations.
-    :type input_loc: LayerOutput | List of LayerOutput
-    :param input_conf: The input priorbox confidence.
-    :type input_conf: LayerOutput | List of LayerOutput
-    :param priorbox: The input priorbox location and the variance.
-    :type priorbox: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param num_classes: The number of the classification.
-    :type num_classes: int
-    :param overlap_threshold: The threshold of the overlap.
-    :type overlap_threshold: float
-    :param neg_pos_ratio: The ratio of the negative bounding box to
-                          the positive bounding box.
-    :type neg_pos_ratio: float
-    :param neg_overlap: The negative bounding box overlap threshold.
-    :type neg_overlap: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input_loc, LayerOutput):
-        input_loc = [input_loc]
-    assert isinstance(input_loc, collections.Sequence)  # list or tuple
-    for each in input_loc:
-        assert isinstance(each, LayerOutput)
-    input_loc_num = len(input_loc)
-
-    if isinstance(input_conf, LayerOutput):
-        input_conf = [input_conf]
-    assert isinstance(input_conf, collections.Sequence)  # list or tuple
-    for each in input_conf:
-        assert isinstance(each, LayerOutput)
-    input_conf_num = len(input_conf)
-    # Check the input layer number.
-    assert input_loc_num == input_conf_num
-
-    inputs = [priorbox.name, label.name]
-    inputs.extend([l.name for l in input_loc])
-    inputs.extend([l.name for l in input_conf])
-    parents = [priorbox, label]
-    parents.extend(input_loc)
-    parents.extend(input_conf)
-
-    Layer(
-        name=name,
-        type=LayerType.MULTIBOX_LOSS_LAYER,
-        inputs=inputs,
-        input_num=input_loc_num,
-        num_classes=num_classes,
-        overlap_threshold=overlap_threshold,
-        neg_pos_ratio=neg_pos_ratio,
-        neg_overlap=neg_overlap,
-        background_id=background_id)
-    return LayerOutput(
-        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
-
-
-@wrap_name_default("detection_output")
-def detection_output_layer(input_loc,
-                           input_conf,
-                           priorbox,
-                           num_classes,
-                           nms_threshold=0.45,
-                           nms_top_k=400,
-                           keep_top_k=200,
-                           confidence_threshold=0.01,
-                           background_id=0,
-                           name=None):
-    """
-    Apply the NMS to the output of network and compute the predict bounding
-    box location. The output's shape of this layer could be zero if there is
-    no valid bounding box.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input_loc: The input predict locations.
-    :type input_loc: LayerOutput | List of LayerOutput.
-    :param input_conf: The input priorbox confidence.
-    :type input_conf: LayerOutput | List of LayerOutput.
-    :param priorbox: The input priorbox location and the variance.
-    :type priorbox: LayerOutput
-    :param num_classes: The number of the classes.
-    :type num_classes: int
-    :param nms_threshold: The Non-maximum suppression threshold.
-    :type nms_threshold: float
-    :param nms_top_k: The bounding boxes number kept of the NMS's output.
-    :type nms_top_k: int
-    :param keep_top_k: The bounding boxes number kept of the layer's output.
-    :type keep_top_k: int
-    :param confidence_threshold: The classification confidence threshold.
-    :type confidence_threshold: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input_loc, LayerOutput):
-        input_loc = [input_loc]
-    assert isinstance(input_loc, collections.Sequence)  # list or tuple
-    for each in input_loc:
-        assert isinstance(each, LayerOutput)
-    input_loc_num = len(input_loc)
-
-    if isinstance(input_conf, LayerOutput):
-        input_conf = [input_conf]
-    assert isinstance(input_conf, collections.Sequence)  # list or tuple
-    for each in input_conf:
-        assert isinstance(each, LayerOutput)
-    input_conf_num = len(input_conf)
-
-    # Check the input layer number.
-    assert input_loc_num == input_conf_num
-
-    inputs = [priorbox.name]
-    inputs.extend([l.name for l in input_loc])
-    inputs.extend([l.name for l in input_conf])
-    parents = [priorbox]
-    parents.extend(input_loc)
-    parents.extend(input_conf)
-
-    size = keep_top_k * 7
-
-    Layer(
-        name=name,
-        type=LayerType.DETECTION_OUTPUT_LAYER,
-        inputs=inputs,
-        size=size,
-        input_num=input_loc_num,
-        num_classes=num_classes,
-        nms_threshold=nms_threshold,
-        nms_top_k=nms_top_k,
-        keep_top_k=keep_top_k,
-        confidence_threshold=confidence_threshold,
-        background_id=background_id)
-    return LayerOutput(
-        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
-
-
-@wrap_name_default("roi_pool")
-def roi_pool_layer(input,
-                   rois,
-                   pooled_width,
-                   pooled_height,
-                   spatial_scale,
-                   num_channels=None,
-                   name=None):
-    """
-    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
-    feature map.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layer.
-    :type input: LayerOutput.
-    :param rois: The input ROIs' data.
-    :type rois: LayerOutput.
-    :param pooled_width: The width after pooling.
-    :type pooled_width: int
-    :param pooled_height: The height after pooling.
-    :type pooled_height: int
-    :param spatial_scale: The spatial scale between the image and feature map.
-    :type spatial_scale: float
-    :param num_channels: The number of the input channels.
-    :type num_channels: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    size = num_channels * pooled_width * pooled_height
-    Layer(
-        name=name,
-        type=LayerType.ROI_POOL_LAYER,
-        inputs=[input.name, rois.name],
-        pooled_width=pooled_width,
-        pooled_height=pooled_height,
-        spatial_scale=spatial_scale,
-        num_channels=num_channels)
-    return LayerOutput(
-        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
-
-
-@wrap_name_default("cross_channel_norm")
-def cross_channel_norm_layer(input, name=None, param_attr=None):
-    """
-    Normalize a layer's output. This layer is necessary for ssd. This
-    layer applys normalization across the channels of each sample to
-    a convolutional layer's output and scales the output by a group of
-    trainable factors whose dimensions equal to the channel's number.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.num_filters is not None
-    Layer(
-        name=name,
-        type=LayerType.NORM_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                norm=Norm(
-                    norm_type="cross-channel-norm",
-                    channels=input.num_filters,
-                    size=input.size,
-                    scale=0,
-                    pow=0,
-                    blocked=0),
-                **param_attr.attr)
-        ])
-    return LayerOutput(
-        name,
-        LayerType.NORM_LAYER,
-        parents=input,
-        num_filters=input.num_filters,
-        size=input.size)
-
-
-@wrap_name_default("seq_pooling")
-@wrap_bias_attr_default(has_bias=False)
-@wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
-@layer_support()
-def pooling_layer(input,
-                  pooling_type=None,
-                  name=None,
-                  bias_attr=None,
-                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
-                  stride=-1,
-                  layer_attr=None):
-    """
-    Pooling layer for sequence inputs, not used for Image.
-
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and returns the pooling value of the sequence in the window as the output. Thus,
-    a long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       seq_pool = pooling_layer(input=layer,
-                                pooling_type=AvgPooling(),
-                                agg_level=AggregateLevel.TO_NO_SEQUENCE)
-
-    :param agg_level: AggregateLevel.TO_NO_SEQUENCE or
-                      AggregateLevel.TO_SEQUENCE
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pooling_type: Type of pooling. MaxPooling is the default pooling.
-    :type pooling_type: BasePoolingType | None
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    extra_dict = dict()
-    # noinspection PyUnresolvedReferences
-    if isinstance(pooling_type, AvgPooling):
-        extra_dict['average_strategy'] = pooling_type.strategy
-    elif isinstance(pooling_type, MaxPooling) and \
-                    pooling_type.output_max_index is not None:
-        assert isinstance(pooling_type.output_max_index, bool)
-        extra_dict['output_max_index'] = pooling_type.output_max_index
-    extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=pooling_type.name,
-        inputs=[Input(input.name)],
-        bias=ParamAttr.to_bias(bias_attr),
-        trans_type=agg_level,
-        stride=stride,
-        **extra_dict)
-
-    return LayerOutput(
-        name, pooling_type.name, parents=[input], size=input.size)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation())
-@wrap_name_default("lstmemory")
-@layer_support()
-def lstmemory(input,
-              name=None,
-              size=None,
-              reverse=False,
-              act=None,
-              gate_act=None,
-              state_act=None,
-              bias_attr=None,
-              param_attr=None,
-              layer_attr=None):
-    """
-    Long Short-term Memory Cell.
-
-    The memory cell was implemented as follow equations.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-
-    NOTE: In PaddlePaddle's implementation, the multiplications
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer,
-    so an additional mixed_layer with full_matrix_projection or a fc_layer must
-    be included in the configuration file to complete the input-to-hidden
-    mappings before lstmemory is called.
-
-    NOTE: This is a low level user interface. You can use network.simple_lstm
-    to config a simple plain lstm layer.
-
-    Reference:
-        `Generating Sequences With Recurrent Neural Networks
-        <https://arxiv.org/pdf/1308.0850.pdf>`_
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: DEPRECATED. The dimension of the lstm cell.
-    :type size: int
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param reverse: Whether the input sequence is processed in a reverse order.
-    :type reverse: bool
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's gates. SigmoidActivation is the
-                     default activation.
-    :type gate_act: BaseActivation
-    :param state_act: Activation type of the state. TanhActivation is the default activation.
-    :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert gate_act.support_hppl
-    assert state_act.support_hppl
-    assert act.support_hppl
-    assert input.size is not None and input.size % 4 == 0
-
-    if size is not None:
-        if input.size / 4 == size:
-            plog = logger.warning
-        else:
-            plog = logger.fatal
-        plog("size of lstmemory layer: %s is automatically set to "
-             "size of input layer / 4. The parameter size passing to "
-             "this layer is ignored." % (name))
-
-    Layer(
-        name=name,
-        type=LayerType.LSTMEMORY,
-        active_type=act.name,
-        active_state_type=state_act.name,
-        active_gate_type=gate_act.name,
-        reversed=reverse,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(input.name, **param_attr.attr)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.LSTMEMORY, [input],
-        size=input.size / 4,
-        reverse=reverse)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=["act"], act=TanhActivation())
-@wrap_name_default("gru")
-@layer_support()
-def grumemory(input,
-              size=None,
-              name=None,
-              reverse=False,
-              act=None,
-              gate_act=None,
-              bias_attr=None,
-              param_attr=None,
-              layer_attr=None):
-    """
-    Gate Recurrent Unit Layer.
-
-    The memory cell was implemented as follow equations.
-
-    1. update gate :math:`z`: defines how much of the previous memory to
-    keep around or the unit updates its activations. The update gate
-    is computed by:
-
-    ..  math::
-
-        z_t = \\sigma(W_{z}x_{t} + U_{z}h_{t-1} + b_z)
-
-    2. reset gate :math:`r`: determines how to combine the new input with the
-    previous memory. The reset gate is computed similarly to the update gate:
-
-    ..  math::
-
-        r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r)
-
-    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to
-    that of the traditional recurrent unit:
-
-    ..  math::
-
-        {\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b)
-
-    4. The hidden activation :math:`h_t` of the GRU at time t is a linear
-    interpolation between the previous activation :math:`h_{t-1}` and the
-    candidate activation :math:`\\tilde{h_t}`:
-
-    ..  math::
-
-        h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
-
-    NOTE: In PaddlePaddle's implementation, the multiplication operations
-    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed
-    in gate_recurrent layer. Consequently, an additional mixed_layer with
-    full_matrix_projection or a fc_layer must be included before grumemory
-    is called.
-
-    Reference:
-        `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
-        <https://arxiv.org/abs/1412.3555>`_
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       gru = grumemory(input)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param size: DEPRECATED. The dimension of the gru cell.
-    :type size: int
-    :param reverse: Whether the input sequence is processed in a reverse order.
-    :type reverse: bool
-    :param act: Activation type, TanhActivation is the default. This activation
-                affects the :math:`{\\tilde{h_t}}`.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
-                     the default activation. This activation affects the :math:`z_t`
-                     and :math:`r_t`. It is the :math:`\\sigma` in the above formula.
-    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert act.support_hppl
-    assert gate_act.support_hppl
-    assert input.size is not None and input.size % 3 == 0
-    if size is not None:
-        if input.size / 3 == size:
-            plog = logger.warning
-        else:
-            plog = logger.fatal
-        plog("size of grumemory layer: %s is automatically set to "
-             "size of input layer / 3. The parameter size passing to this "
-             "layer is ignored." % (name))
-
-    Layer(
-        name=name,
-        type=LayerType.GRUMEMORY,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        reversed=reverse,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(input.name, **param_attr.attr)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.GRUMEMORY, [input],
-        size=input.size / 3,
-        reverse=reverse)
-
-
-@wrap_name_default()
-@layer_support()
-def last_seq(input,
-             name=None,
-             agg_level=AggregateLevel.TO_NO_SEQUENCE,
-             stride=-1,
-             layer_attr=None):
-    """
-    Get Last Timestamp Activation of a sequence.
-
-    If stride > 0, this layer will slide a window whose size is determined by stride,
-    and return the last value of the sequence in the window as the output. Thus, a
-    long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       seq = last_seq(input=layer)
-
-    :param agg_level: Aggregated level
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if input.reverse is not None and input.reverse:
-        logger.warning("You are getting the last instance of a sequence that"
-                       " is a output of a REVERSED layer. There is no time"
-                       " series information at all. Maybe you want to use"
-                       " first_seq instead.")
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_LAST_INSTANCE,
-        inputs=[input.name],
-        trans_type=agg_level,
-        stride=stride,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEQUENCE_LAST_INSTANCE,
-        parents=[input],
-        size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def first_seq(input,
-              name=None,
-              agg_level=AggregateLevel.TO_NO_SEQUENCE,
-              stride=-1,
-              layer_attr=None):
-    """
-    Get First Timestamp Activation of a sequence.
-
-    If stride > 0, this layer will slide a window whose size is determined by stride,
-    and return the first value of the sequence in the window as the output. Thus, a
-    long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       seq = first_seq(input=layer)
-
-    :param agg_level: aggregation level
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if input.reverse is not None and not input.reverse:
-        logger.warning('You are getting the first instance for a time series,'
-                       ' and it is a normal recurrent layer output. There is no'
-                       ' time series information at all. Maybe you want to use'
-                       ' last_seq instead.')
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_FIRST_INSTANCE,
-        inputs=[input.name],
-        trans_type=agg_level,
-        stride=stride,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEQUENCE_FIRST_INSTANCE,
-        parents=[input],
-        size=input.size)
-
-
-class ExpandLevel(object):
-    """
-    Please refer to AggregateLevel first.
-
-    ExpandLevel supports two modes:
-
-    - :code:`ExpandLevel.FROM_NO_SEQUENCE` means the expansion acts on
-      :code:`NO_SEQUENCE`, which will be expanded to
-      :code:`SEQUENCE` or :code:`SUB_SEQUENCE`.
-
-    - :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on
-      :code:`SEQUENCE`, which will be expanded to
-      :code:`SUB_SEQUENCE`.
-    """
-    FROM_NO_SEQUENCE = AggregateLevel.TO_NO_SEQUENCE
-    FROM_SEQUENCE = AggregateLevel.TO_SEQUENCE
-    # compatible with previous configuration
-    FROM_TIMESTEP = FROM_NO_SEQUENCE
-
-
-@wrap_name_default()
-@layer_support()
-def expand_layer(input,
-                 expand_as,
-                 name=None,
-                 bias_attr=False,
-                 expand_level=ExpandLevel.FROM_NO_SEQUENCE,
-                 layer_attr=None):
-    """
-    A layer for expanding dense data or (sequence data where the length of each
-    sequence is one) to sequence data.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       expand = expand_layer(input=layer1,
-                             expand_as=layer2,
-                             expand_level=ExpandLevel.FROM_NO_SEQUENCE)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param expand_as: Expand the input according to this layer's sequence infomation. And
-                      after the operation, the input expanded will have the same number of
-                      elememts as this layer.
-    :type expand_as: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param expand_level: Whether the input layer is a sequence or the element of a sequence.
-    :type expand_level: ExpandLevel
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    Layer(
-        inputs=[input.name, expand_as.name],
-        name=name,
-        bias=ParamAttr.to_bias(bias_attr=bias_attr),
-        type=LayerType.EXPAND_LAYER,
-        trans_type=expand_level,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=input.size,
-        layer_type=LayerType.EXPAND_LAYER,
-        parents=[input, expand_as])
-
-
-@wrap_name_default()
-@wrap_act_default(act=IdentityActivation())
-@layer_support()
-def repeat_layer(input,
-                 num_repeats,
-                 as_row_vector=True,
-                 act=None,
-                 name=None,
-                 layer_attr=None):
-    """
-    A layer for repeating the input for num_repeats times.
-
-    If as_row_vector:
-
-    .. math::
-       y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
-
-    If not as_row_vector:
-
-    .. math::
-       y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
-
-
-    The example usage is:
-
-    .. code-block:: python
-
-       expand = repeat_layer(input=layer, num_repeats=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_repeats: The times of repeating the input.
-    :type num_repeats: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param as_row_vector: Whether to treat the input as row vectors or not. If
-                          the parameter is set to True, the repeating operation
-                          will be performed in the column direction. Otherwise,
-                          it will be performed in the row direction.
-    :type as_row_vector: bool
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    l = Layer(
-        inputs=[input.name],
-        name=name,
-        active_type=act.name,
-        num_filters=num_repeats,
-        as_row_vector=as_row_vector,
-        type=LayerType.FEATURE_MAP_EXPAND_LAYER,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=l.config.size,
-        layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
-        activation=act,
-        parents=[input])
-
-
-@wrap_name_default("seqreshape")
-@wrap_act_default(act=IdentityActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def seq_reshape_layer(input,
-                      reshape_size,
-                      act=None,
-                      name=None,
-                      layer_attr=None,
-                      bias_attr=None):
-    """
-    A layer for reshaping the sequence. Assume the input sequence has T instances,
-    the dimension of each instance is M, and the input reshape_size is N, then the
-    output sequence has T*M/N instances, the dimension of each instance is N.
-
-    Note that T*M/N must be an integer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       reshape = seq_reshape_layer(input=layer, reshape_size=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param reshape_size: The dimension of the reshaped sequence.
-    :type reshape_size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    Layer(
-        inputs=[input.name],
-        name=name,
-        size=reshape_size,
-        type=LayerType.SEQUENCE_RESHAPE,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=reshape_size,
-        layer_type=LayerType.SEQUENCE_RESHAPE,
-        parents=[input])
-
-
-@wrap_name_default()
-@layer_support()
-def interpolation_layer(input, weight, name=None, layer_attr=None):
-    """
-    This layer performs linear interpolation on two inputs,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
-
-    where :math:`x_1` and :math:`x_2` are two (batchSize x dataDim) inputs,
-    :math:`w` is (batchSize x 1) weight vector, and :math:`y` is
-    (batchSize x dataDim) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       interpolation = interpolation_layer(input=[layer1, layer2], weight=layer3)
-
-    :param input: The input of this layer.
-    :type input: list | tuple
-    :param weight: Weight layer.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, collections.Sequence)
-    assert len(input) == 2
-    assert isinstance(input[0], LayerOutput) and isinstance(input[1],
-                                                            LayerOutput)
-    if input[0].size is not None and input[1].size is not None:
-        assert input[0].size == input[1].size
-    assert isinstance(weight, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.INTERPOLATION_LAYER,
-        inputs=[weight.name, input[0].name, input[1].name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.INTERPOLATION_LAYER,
-        parents=[weight, input[0], input[1]],
-        size=input[0].size)
-
-
-@wrap_name_default()
-@layer_support()
-def bilinear_interp_layer(input,
-                          out_size_x=None,
-                          out_size_y=None,
-                          name=None,
-                          layer_attr=None):
-    """
-    This layer implements bilinear interpolation on convolutional layer's output.
-
-    Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param out_size_x: The width of the output.
-    :type out_size_x: int
-    :param out_size_y: The height of the output.
-    :type out_size_y: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.layer_type == LayerType.CONV_LAYER
-    assert isinstance(input.activation, LinearActivation)
-    assert out_size_x > 0 and out_size_y > 0
-    assert input.num_filters is not None
-    num_channels = input.num_filters
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            bilinear_interp=BilinearInterp(
-                out_size_x=out_size_x,
-                out_size_y=out_size_y,
-                channels=num_channels)),
-        type=LayerType.BILINEAR_INTERP_LAYER,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.BILINEAR_INTERP_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def power_layer(input, weight, name=None, layer_attr=None):
-    """
-    This layer applies a power function to a vector element-wise,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       y = x^w
-
-    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
-    and :math:`y` is an output vector.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       power = power_layer(input=layer1, weight=layer2)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param weight: The exponent of the power.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput) and isinstance(weight, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.POWER_LAYER,
-        inputs=[weight.name, input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.POWER_LAYER, parents=[input, weight], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def scaling_layer(input, weight, name=None, layer_attr=None):
-    """
-    A layer for multiplying input vector by weight scalar.
-
-    .. math::
-       y  = w x
-
-    where :math:`x` is size=dataDim input, :math:`w` is size=1 weight,
-    and :math:`y` is size=dataDim output.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       scale = scaling_layer(input=layer1, weight=layer2)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param weight: The weight of each sample.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(weight, LayerOutput) and isinstance(input, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.SCALING_LAYER,
-        inputs=[weight.name, input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SCALING_LAYER, parents=[weight, input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def trans_layer(input, name=None, layer_attr=None):
-    """
-    A layer for transposing a minibatch matrix.
-
-    .. math::
-       y = x^\mathrm{T}
-
-    where :math:`x` is (M x N) input, and :math:`y` is (N x M) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       trans = trans_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.TRANS_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.TRANS_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def rotate_layer(input, height, width, name=None, layer_attr=None):
-    """
-    A layer for rotating 90 degrees (clock-wise) for each feature channel,
-    usually used when the input sample is some image or feature map.
-
-    .. math::
-       y(j,i,:) = x(M-i-1,j,:)
-
-    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       rot = rotate_layer(input=layer,
-                          height=100,
-                          width=100)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param height: The height of the sample matrix.
-    :type height: int
-    :param width: The width of the sample matrix.
-    :type width: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    l = Layer(
-        name=name,
-        height=height,
-        width=width,
-        type=LayerType.ROTATE_LAYER,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.ROTATE_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
-    """
-    Cosine Similarity Layer. The cosine similarity equation is here.
-
-    ..  math::
-        similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b}
-        \\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|}
-
-    The size of a is M, size of b is M*N,
-    Similarity will be calculated N times by step M. The output size is
-    N. The scale will be multiplied to similarity.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cos = cos_sim(a=layer1, b=layer2, size=3)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param scale: The scale of the cosine similarity. 1 is the default value.
-    :type scale: float
-    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
-    :type size: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    if size == 1:
-        Layer(
-            name=name,
-            type=LayerType.COSINE_SIM,
-            cos_scale=scale,
-            inputs=[a.name, b.name],
-            **ExtraLayerAttribute.to_kwargs(layer_attr))
-    else:
-        if a.size is not None and b.size is not None:
-            assert size == b.size / a.size
-        Layer(
-            name=name,
-            type=LayerType.COSINE_SIM_VEC,
-            size=size,
-            cos_scale=scale,
-            inputs=[a.name, b.name],
-            **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def l2_distance_layer(x, y, name=None, layer_attr=None):
-    """
-    This layer calculates and returns the Euclidean distance between two input
-    vectors x and y. The equation is as follows:
-
-    ..  math::
-        l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)}
-
-    The output size of this layer is fixed to be 1. Note that the above
-    computation is for one sample. Multiple samples are processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       l2_sim = l2_distance(x=layer1, y=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param x: The first input x for this layer, whose output is a matrix with
-              dimensionality N x D. N is the sample number in a mini-batch.
-              D is the dimensionality of x's output.
-    :type x: LayerOutput
-    :param y: The second input y for this layer, whose output is a matrix with
-              dimensionality N x D. N is the sample number in a mini-batch.
-              D is the dimensionality of y's output.
-    :type y: LayerOutput
-    :param layer_attr: The extra layer attributes, for example, drop rate.
-                       See ExtraLayerAttribute for more details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: The returned LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.L2_DISTANCE,
-        inputs=[x.name, y.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
-
-
-@wrap_name_default()
-@wrap_bias_attr_default(has_bias=True)
-@wrap_param_attr_default()
-@layer_support()
-def hsigmoid(input,
-             label,
-             num_classes=None,
-             name=None,
-             bias_attr=None,
-             param_attr=None,
-             layer_attr=None):
-    """
-    Organize the classes into a binary tree. At each node, a sigmoid function
-    is used to calculate the probability of belonging to the right branch.
-
-    Reference:
-        `Hierarchical Probabilistic Neural Network Language Model
-        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        cost = hsigmoid(input=[layer1, layer2],
-                        label=data_layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param label: The input label.
-    :type label: LayerOutput
-    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
-                        is not set or set to None, its actual value will be automatically set to
-                        the number of labels.
-    :type num_classes: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        if not isinstance(param_attr, collections.Sequence):
-            param_attr = [param_attr]
-    else:
-        if not isinstance(param_attr, collections.Sequence):
-            param_attr = [param_attr] * len(input)
-        else:
-            assert len(param_attr) == len(input)
-
-    assert isinstance(input, collections.Sequence)
-    assert isinstance(label, LayerOutput)
-    assert label.layer_type == LayerType.DATA
-
-    if num_classes is None:
-        num_classes = label.size
-    if num_classes is None or num_classes <= 2:
-        raise ValueError("hsigmoid label size must larger than 2.")
-
-    ipts_for_layer = []
-    parents = []
-    for each_input, each_param_attr in zip(input, param_attr):
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name, **each_param_attr.attr))
-        parents.append(each_input)
-    ipts_for_layer.append(label.name)
-    parents.append(label)
-
-    l = Layer(
-        name=name,
-        type=LayerType.HSIGMOID,
-        num_classes=num_classes,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=ipts_for_layer,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HSIGMOID, parents=parents, size=l.config.size)
-
-
-@wrap_name_default("conv")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=ReluActivation())
-@layer_support(DROPOUT)
-def img_conv_layer(input,
-                   filter_size,
-                   num_filters,
-                   name=None,
-                   num_channels=None,
-                   act=None,
-                   groups=1,
-                   stride=1,
-                   padding=0,
-                   dilation=1,
-                   bias_attr=None,
-                   param_attr=None,
-                   shared_biases=True,
-                   layer_attr=None,
-                   filter_size_y=None,
-                   stride_y=None,
-                   padding_y=None,
-                   dilation_y=None,
-                   trans=False,
-                   layer_type=None):
-    """
-    Convolution layer for image. Paddle can support both square and non-square
-    input currently.
-
-    The details of convolution layer, please refer UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/
-    FeatureExtractionUsingConvolution/>`_ .
-
-    Convolution Transpose (deconv) layer for image. Paddle can support both square
-    and non-square input currently.
-
-    The details of convolution transpose layer,
-    please refer to the following explanation and references therein
-    <http://datascience.stackexchange.com/questions/6107/
-    what-are-deconvolutional-layers/>`_ .
-    The num_channel means input image's channel number. It may be 1 or 3 when
-    input is raw pixels of image(mono or RGB), or it may be the previous layer's
-    num_filters.
-
-    There are several groups of filters in PaddlePaddle implementation.
-    If the groups attribute is greater than 1, for example groups=2,
-    the input will be splitted into 2 parts along the channel axis, and
-    the filters will also be splitted into 2 parts. The first half of the filters 
-    is only connected to the first half of the input channels, while the second 
-    half of the filters is only connected to the second half of the input. After
-    the computation of convolution for each part of input,
-    the output will be obtained by concatenating the two results.
-
-    The details of grouped convolution, please refer to:
-    `ImageNet Classification With Deep Convolutional Neural Networks
-    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
-    
-    The example usage is:
-
-    ..  code-block:: python
-
-        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                              num_channels=8,
-                              num_filters=16, stride=1,
-                              bias_attr=False,
-                              act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel. If the parameter is
-                        set to one integer, the two dimensions on x and y axises
-                        will be same when filter_size_y is not set. If it is set
-                        to a list, the first element indicates the dimension on
-                        the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size_y is not provided.
-    :type filter_size: int | tuple | list
-    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
-                          is not set, it will be set automatically according to filter_size.
-    :type filter_size_y: int
-    :param num_filters: The number of filters. It is as same as the output image channel.
-    :type num_filters: int
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param groups: The group number. 1 is the default group number.
-    :type groups: int
-    :param stride: The strides. If the parameter is set to one integer, the strides
-                   on x and y axises will be same when stride_y is not set. If it is
-                   set to a list, the first element indicates the stride on the x axis,
-                   and the second is used to specify the stride on the y axis when
-                   stride_y is not provided. 1 is the default value.
-    :type stride: int | tuple | list
-    :param stride_y: The stride on the y axis.
-    :type stride_y: int
-    :param padding: The padding sizes. If the parameter is set to one integer, the padding
-                    sizes on x and y axises will be same when padding_y is not set. If it
-                    is set to a list, the first element indicates the padding size on the
-                    x axis, and the second is used to specify the padding size on the y axis
-                    when padding_y is not provided. 0 is the default padding size.
-    :type padding: int | tuple | list
-    :param padding_y: The padding size on the y axis.
-    :type padding_y: int
-    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
-                     the two dimensions on x and y axises will be same when dilation_y is not
-                     set. If it is set to a list, the first element indicates the dimension
-                     on the x axis, and the second is used to specify the dimension on the y
-                     axis when dilation_y is not provided. 1 is the default dimension.
-    :type dilation: int | tuple | list
-    :param dilation_y: The dimension of the dilation on the y axis.
-    :type dilation_y: int
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channel number of the input.
-    :type num_channels: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param shared_biases: Whether biases will be shared between filters or not.
-    :type shared_biases: bool
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param trans: True if it is a convTransLayer, False if it is a convLayer
-    :type trans: bool
-    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
-                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
-                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
-    :type layer_type: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if filter_size_y is None:
-        if isinstance(filter_size, collections.Sequence):
-            assert len(filter_size) == 2
-            filter_size, filter_size_y = filter_size
-        else:
-            filter_size_y = filter_size
-
-    if stride_y is None:
-        if isinstance(stride, collections.Sequence):
-            assert len(stride) == 2
-            stride, stride_y = stride
-        else:
-            stride_y = stride
-
-    if padding_y is None:
-        if isinstance(padding, collections.Sequence):
-            assert len(padding) == 2
-            padding, padding_y = padding
-        else:
-            padding_y = padding
-
-    if dilation_y is None:
-        if isinstance(dilation, collections.Sequence):
-            assert len(dilation) == 2
-            dilation, dilation_y = dilation
-        else:
-            dilation_y = dilation
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    if layer_type:
-        if dilation > 1 or dilation_y > 1:
-            assert layer_type in [
-                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
-            ]
-        if trans:
-            assert layer_type in ["exconvt", "cudnn_convt"]
-        else:
-            assert layer_type in ["exconv", "cudnn_conv"]
-        lt = layer_type
-    else:
-        lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
-
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            conv=Conv(
-                filter_size=filter_size,
-                padding=padding,
-                dilation=dilation,
-                stride=stride,
-                channels=num_channels,
-                groups=groups,
-                filter_size_y=filter_size_y,
-                padding_y=padding_y,
-                dilation_y=dilation_y,
-                stride_y=stride_y),
-            **param_attr.attr),
-        active_type=act.name,
-        num_filters=num_filters,
-        bias=ParamAttr.to_bias(bias_attr),
-        shared_biases=shared_biases,
-        type=lt,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        lt,
-        parents=[input],
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_name_default("pool")
-@layer_support()
-def img_pool_layer(input,
-                   pool_size,
-                   name=None,
-                   num_channels=None,
-                   pool_type=None,
-                   stride=1,
-                   padding=0,
-                   layer_attr=None,
-                   pool_size_y=None,
-                   stride_y=None,
-                   padding_y=None,
-                   ceil_mode=True,
-                   exclude_mode=None):
-    """
-    Image pooling Layer.
-
-    The details of pooling layer, please refer to ufldl's pooling_ .
-
-    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
-
-    - ceil_mode=True:
-
-    ..  math::
-
-        w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride})
-
-        h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
-
-    - ceil_mode=False:
-
-    ..  math::
-
-        w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride})
-
-        h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        maxpool = img_pool_layer(input=conv,
-                                 pool_size=3,
-                                 pool_size_y=5,
-                                 num_channels=8,
-                                 stride=1,
-                                 stride_y=2,
-                                 padding=1,
-                                 padding_y=2,
-                                 pool_type=MaxPooling())
-
-    :param padding: The padding size on the x axis. 0 is the default padding size.
-    :type padding: int
-    :param padding_y: The padding size on the y axis. If the parameter is not set
-                      or set to None, it will be set to 'padding' automatically.
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pool_size: The pooling window length on the x axis.
-    :type pool_size: int
-    :param pool_size_y: The pooling window length on the y axis. If the parameter is
-                        not set or set to None, its actual value will be automatically
-                        set to pool_size.
-    :type pool_size_y: int
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type pool_type: BasePoolingType
-    :param stride: The stride on the x axis. 1 is the default value.
-    :type stride: int
-    :param stride_y: The stride on the y axis. If the parameter is not set or set to
-                     None, its actual value will be automatically set to 'stride'.
-    :type stride_y: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
-                      True is the default. If it is set to False, the floor function will
-                      be used.
-    :type ceil_mode: bool
-    :param exclude_mode: Whether to exclude the padding cells when calculating, but only 
-                         work when pool_type is AvgPooling. If None, also exclude the padding 
-                         cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling 
-                         as pool_type to identify the mode.
-    :type exclude_mode: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
-                               CudnnMaxPooling, CudnnAvgInclPadPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
-
-    type_name = pool_type.name + '-projection' \
-        if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
-    pool_size_y = pool_size if pool_size_y is None else pool_size_y
-    stride_y = stride if stride_y is None else stride_y
-    padding_y = padding if padding_y is None else padding_y
-
-    l = Layer(
-        name=name,
-        type=LayerType.POOL_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                pool=Pool(
-                    pool_type=type_name,
-                    channels=num_channels,
-                    size_x=pool_size,
-                    start=None,
-                    stride=stride,
-                    padding=padding,
-                    size_y=pool_size_y,
-                    stride_y=stride_y,
-                    padding_y=padding_y))
-        ],
-        ceil_mode=ceil_mode,
-        exclude_mode=exclude_mode,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.POOL_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default("pool3d")
-@layer_support()
-def img_pool3d_layer(input,
-                     pool_size,
-                     name=None,
-                     num_channels=None,
-                     pool_type=None,
-                     stride=1,
-                     padding=0,
-                     layer_attr=None,
-                     pool_size_y=None,
-                     stride_y=None,
-                     padding_y=None,
-                     pool_size_z=None,
-                     stride_z=None,
-                     padding_z=None,
-                     ceil_mode=True):
-    """
-    Image pooling Layer.
-
-    The details of pooling layer, please refer ufldl's pooling_ .
-
-    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
-
-    - ceil_mode=True:
-
-    ..  math::
-
-        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
-
-        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
-
-        d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
-
-    - ceil_mode=False:
-
-    ..  math::
-
-        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
-
-        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
-
-        d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        maxpool = img_pool3d_layer(input=conv,
-                                 pool_size=3,
-                                 num_channels=8,
-                                 stride=1,
-                                 padding=1,
-                                 pool_type=MaxPooling())
-
-    :param padding: pooling padding width.
-    :type padding: int | tuple | list
-    :param name: The name of this layer. It is optional.
-    :type name: basestring.
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pool_size: The pooling window lengths along three axises. If the parameter
-                      is set to one integer, the three lengths will be same.
-    :type pool_size: int | tuple | list
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type pool_type: BasePoolingType
-    :param stride: The strides of the pooling along three axises. If the parameter
-                   is set to one integer, the three strides will be same. 1 is the
-                   default value.
-    :type stride: int | tuple | list
-    :param padding: The sizes of padding along three axises. If the parameter is set to
-                    one integer, they will be same. 0 is the default padding size.
-    :type padding: int | tuple | list
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
-                      True is the default. If it is set to False, the floor function will
-                      be used.
-    :type ceil_mode: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    type_name = pool_type.name + '-projection' \
-        if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
-
-    if isinstance(pool_size, collections.Sequence):
-        assert len(pool_size) == 3
-        pool_size, pool_size_y, pool_size_z = pool_size
-    else:
-        pool_size_y = pool_size
-        pool_size_z = pool_size
-
-    if isinstance(stride, collections.Sequence):
-        assert len(stride) == 3
-        stride, stride_y, stride_z = stride
-    else:
-        stride_y = stride
-        stride_z = stride
-
-    if isinstance(padding, collections.Sequence):
-        assert len(padding) == 3
-        padding, padding_y, padding_y = padding
-    else:
-        padding_y = padding
-        padding_z = padding
-
-    l = Layer(
-        name=name,
-        type=LayerType.POOL3D_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                pool=Pool3d(
-                    pool_type=type_name,
-                    channels=num_channels,
-                    size_x=pool_size,
-                    start=None,
-                    stride=stride,
-                    padding=padding,
-                    size_y=pool_size_y,
-                    stride_y=stride_y,
-                    padding_y=padding_y,
-                    size_z=pool_size_z,
-                    stride_z=stride_z,
-                    padding_z=padding_z))
-        ],
-        ceil_mode=ceil_mode,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.POOL_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default("upsample")
-@layer_support()
-def upsample_layer(input,
-                   name=None,
-                   scale=None,
-                   scale_y=None,
-                   upsample_size=None,
-                   upsample_size_y=None,
-                   pad_out_x=False,
-                   pad_out_y=False,
-                   layer_attr=None):
-    """
-    The DePooling process.
-    Inputs should be a list of length 2. The first input is a layer,
-    and the second input should be the MaxWithMaskPoolingLayer
-
-    The example usage is:
-
-    ..  code-block:: python
-        pool1 = paddle.v2.layer.img_pool(input=input, pool_size=2, stride=2,
-                                        pool_type=paddle.pooling.MaxWithMask())
-        upsample = paddle.v2.layer.upsample(input=[layer1, pool1])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: contains an input layer and a MaxWithMaskPoolingLayer
-    :type input: list | tuple | collections.Sequence
-    :param scale: outputSize =  scale * inputSize
-    :type scale: int | list | tuple | .
-    :param scale_y: scale_y will be equal to scale, if it's value is None, 
-    :type scale: int | None. 
-    :param upsample_size: specify the outputSize.
-    :type upsample_size: int | list | tuple.
-    :param upsample_size_y: specify the y dimension outputSize.
-    :type upsample_size_y: int.
-    :param pad_out_x: specify exact x dimension size. This parameter only works when scale is 2
-    :type pad_out_x: bool.
-    :param pad_out_y: specify exact y dimension size. This parameter only works when scale is 2
-    :type pad_out_y: bool.
-    :param layer_attr: Extra Layer Attribute.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert (scale is not None) or (upsample_size is not None), \
-            'scale or upsample_size, there must be one to be designated'
-
-    assert len(input) == 2, 'layer input size must be 2'
-
-    assert input[1].layer_type == LayerType.POOL_LAYER, \
-            'the second input should be the MaxPoolWithMaskLayer'
-
-    scale_y = scale \
-            if scale is not None else scale_y
-    upsample_size_y = upsample_size  \
-            if upsample_size is not None else upsample_size_y
-
-    layer_type = LayerType.UPSAMPLE_LAYER
-
-    layer = Layer(
-        name=name,
-        type=layer_type,
-        inputs=[
-            Input(
-                input[0].name,
-                upsample=Upsample(scale, scale_y, pad_out_x, pad_out_y,
-                                  upsample_size, upsample_size_y)),
-            Input(input[1].name)
-        ],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    sz = layer.config.size
-
-    return LayerOutput(name, layer_type=layer_type, parents=input, size=sz)
-
-
-@wrap_name_default("spp")
-@layer_support()
-def spp_layer(input,
-              name=None,
-              num_channels=None,
-              pool_type=None,
-              pyramid_height=None,
-              layer_attr=None):
-    """
-    A layer performs spatial pyramid pooling.
-
-    Reference:
-        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        <https://arxiv.org/abs/1406.4729>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        spp = spp_layer(input=data,
-                        pyramid_height=2,
-                        num_channels=16,
-                        pool_type=MaxPooling())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type scale: BasePoolingType
-    :param pyramid_height: The pyramid height of this pooling.
-    :type pyramid_height: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    type_name = pool_type.name
-    if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)):
-        type_name += '-projection'
-
-    l = Layer(
-        name=name,
-        type=LayerType.SPP_LAYER,
-        inputs=Input(
-            input.name,
-            spp=SpatialPyramidPool(
-                pool_type=type_name,
-                channels=num_channels,
-                pyramid_height=pyramid_height)),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.SPP_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-def __img_norm_layer__(name, input, size, norm_type, scale, power, num_channels,
-                       blocked, layer_attr):
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    l = Layer(
-        name=name,
-        type=LayerType.NORM_LAYER,
-        inputs=Input(
-            input.name,
-            norm=Norm(
-                norm_type=norm_type,
-                channels=num_channels,
-                size=size,
-                scale=scale,
-                pow=power,
-                blocked=blocked)),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.NORM_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        img_norm_type=norm_type,
-        size=l.config.size)
-
-
-@wrap_name_default("crmnorm")
-@layer_support()
-def img_cmrnorm_layer(input,
-                      size,
-                      scale=0.0128,
-                      power=0.75,
-                      name=None,
-                      num_channels=None,
-                      layer_attr=None):
-    """
-    Response normalization across feature maps.
-
-    Reference:
-        `ImageNet Classification with Deep Convolutional Neural Networks
-        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        norm = img_cmrnorm_layer(input=net, size=5)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: Normalize in number of :math:`size` feature maps.
-    :type size: int
-    :param scale: The hyper-parameter.
-    :type scale: float
-    :param power: The hyper-parameter.
-    :type power: float
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    return __img_norm_layer__(name, input, size, "cmrnorm-projection", scale,
-                              power, num_channels, 0, layer_attr)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default(
-    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
-@wrap_act_default(act=ReluActivation())
-@wrap_name_default("batch_norm")
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def batch_norm_layer(input,
-                     act=None,
-                     name=None,
-                     img3D=False,
-                     num_channels=None,
-                     bias_attr=None,
-                     param_attr=None,
-                     layer_attr=None,
-                     batch_norm_type=None,
-                     epsilon=1e-5,
-                     moving_average_fraction=0.9,
-                     use_global_stats=None,
-                     mean_var_names=None):
-    """
-    Batch Normalization Layer. The notation of this layer is as follows.
-
-    :math:`x` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-    Reference:
-        `Batch Normalization: Accelerating Deep Network Training by Reducing
-        Internal Covariate Shift
-        <http://arxiv.org/abs/1502.03167>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        norm = batch_norm_layer(input=net, act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: This layer's input which is to be performed batch normalization on.
-    :type input: LayerOutput
-    :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
-                            batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
-                            requires cuDNN version greater or equal to v4 (>=v4).
-                            But cudnn_batch_norm is faster and needs less
-                            memory than batch_norm. mkldnn_batch_norm requires
-                            use_mkldnn is enabled. By default (None), we will
-                            automatically select cudnn_batch_norm for GPU,
-                            mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
-                            Users can specify the batch norm type. If you use
-                            cudnn_batch_norm, we suggested you use latest version,
-                            such as v5.1.
-    :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
-                           or "mkldnn_batch_norm"
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
-                      False or an object whose type is not ParameterAttribute, no
-                      bias is defined. If the parameter is set to True, the bias is
-                      initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
-                       for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param use_global_stats: Whether use moving mean/variance statistics during
-                             testing peroid. If the parameter is set to None or
-                             True, it will use moving mean/variance statistics
-                             during testing. If the parameter is set to False, it
-                             will use the mean and variance of the current batch
-                             of test data.
-    :type use_global_stats: bool | None.
-    :param epsilon: The small constant added to the variance to improve numeric stability.
-    :type epsilon: float.
-    :param moving_average_fraction: Factor used in the moving average computation.
-                                   :math:`runningMean = newMean*(1-factor) + runningMean*factor`
-    :type moving_average_fraction: float.
-    :param mean_var_names: [mean name, variance name]
-    :type mean_var_names: string list
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if num_channels is None:
-        if input.num_filters is not None:
-            num_channels = input.num_filters
-        else:
-            num_channels = input.size
-    assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
-           (batch_norm_type == "mkldnn_batch_norm") or \
-           (batch_norm_type == "cudnn_batch_norm")
-
-    l = Layer(
-        name=name,
-        img3D=img3D,
-        inputs=Input(
-            input.name, image=Image(channels=num_channels), **param_attr.attr),
-        active_type=act.name,
-        type=LayerType.BATCH_NORM_LAYER,
-        batch_norm_type=batch_norm_type,
-        bias=ParamAttr.to_bias(bias_attr),
-        epsilon=epsilon,
-        moving_average_fraction=moving_average_fraction,
-        use_global_stats=use_global_stats,
-        mean_var_names=mean_var_names,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.BATCH_NORM_LAYER,
-        parents=[input],
-        activation=act,
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def sum_to_one_norm_layer(input, name=None, layer_attr=None):
-    """
-    A layer for sum-to-one normalization,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       out[i] = \\frac {in[i]} {\sum_{k=1}^N in[k]}
-
-    where :math:`in` is a (batchSize x dataDim) input vector,
-    and :math:`out` is a (batchSize x dataDim) output vector.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       sum_to_one_norm = sum_to_one_norm_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
-                       for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SUM_TO_ONE_NORM_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def row_l2_norm_layer(input, name=None, layer_attr=None):
-    """
-    A layer for L2-normalization in each row.
-
-    .. math::
-       out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}}
-
-    where the size of :math:`in` is (batchSize x dataDim) ,
-    and the size of :math:`out` is a (batchSize x dataDim) .
-
-    The example usage is:
-
-    .. code-block:: python
-
-       row_l2_norm_layer = row_l2_norm_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
-                       for details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.ROW_L2_NORM_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default("addto")
-@wrap_act_default(act=LinearActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
-    """
-    AddtoLayer.
-
-    ..  math::
-
-        y = f(\\sum_{i} x_i + b)
-
-    where :math:`y` is output, :math:`x` is input, :math:`b` is bias,
-    and :math:`f` is activation function.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        addto = addto_layer(input=[layer1, layer2],
-                            act=ReluActivation(),
-                            bias_attr=False)
-
-    This layer just simply adds all input layers together, then activates the
-    sum. All inputs should share the same dimension, which is also the dimension
-    of this layer's output.
-
-    There is no weight matrix for each input, because it just a simple add
-    operation. If you want a complicated operation before add, please use
-    mixed_layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layers. It could be a LayerOutput or list/tuple of
-                 LayerOutput.
-    :type input: LayerOutput | list | tuple
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    num_filters = None
-    if isinstance(input, LayerOutput):
-        input = [input]
-
-    assert isinstance(input, collections.Sequence)
-    ipts_for_layer = []
-    for each_input in input:
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name))
-        if each_input.num_filters is not None:
-            num_filters = each_input.num_filters
-
-    l = Layer(
-        name=name,
-        type=LayerType.ADDTO_LAYER,
-        inputs=ipts_for_layer,
-        bias=ParamAttr.to_bias(bias_attr),
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.ADDTO_LAYER,
-        parents=input,
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_act_default(act=IdentityActivation())
-@wrap_name_default("concat")
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
-    """
-    Concatenate all input vectors to one vector.
-    Inputs can be a list of LayerOutput or a list of projection.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        concat = concat_layer(input=[layer1, layer2])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layers or projections
-    :type input: list | tuple | collections.Sequence
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if isinstance(input, LayerOutput):
-        input = [input]
-    elif isinstance(input, Projection):
-        input = [input]
-    else:
-        assert isinstance(input, collections.Sequence)
-
-    def __is_type__(o, tp):
-        if not isinstance(o, collections.Sequence):
-            if o == tp:
-                return True
-            elif len(o.__bases__) == 0:
-                return False
-            else:
-                for bs in o.__bases__:
-                    if __is_type__(bs, tp):
-                        return True
-                return False
-        else:
-            tmp = map(lambda _x: __is_type__(_x, tp), o)
-            a = tmp[0]
-            for b in tmp[1:]:
-                assert a == b
-            return a
-
-    def __reduce_concat_type__(a, b):
-        assert __is_type__([a, b], Projection) or __is_type__([a, b],
-                                                              LayerOutput)
-        return a
-
-    is_concat_layer = __is_type__(
-        reduce(__reduce_concat_type__, map(type, input)), LayerOutput)
-
-    layer_type = (LayerType.CONCAT_LAYER
-                  if is_concat_layer else LayerType.CONCAT_PROJ_LAYER)
-
-    if layer_type == LayerType.CONCAT_LAYER:
-        assert not bias_attr
-
-    layer = Layer(
-        name=name,
-        type=layer_type,
-        inputs=[x.name for x in input] if is_concat_layer else input,
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    sz = layer.config.size
-
-    return LayerOutput(
-        name,
-        layer_type=layer_type,
-        parents=input if is_concat_layer else [x.origin for x in input],
-        activation=act,
-        size=sz)
-
-
-@wrap_name_default("seqconcat")
-@wrap_act_default(act=IdentityActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
-                     bias_attr=None):
-    """
-    Concatenate sequence a and sequence b.
-
-    Inputs:
-      - a = [a1, a2, ..., am]
-      - b = [b1, b2, ..., bn]
-
-    Output: [a1, ..., am, b1, ..., bn]
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        concat = seq_concat_layer(a=layer1, b=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input sequence layer
-    :type a: LayerOutput
-    :param b: The second input sequence layer
-    :type b: LayerOutput
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    assert a.size == b.size
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_CONCAT_LAYER,
-        inputs=[a.name, b.name],
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
-        parents=[a, b],
-        activation=act,
-        size=a.size)
-
-
-@wrap_name_default("memory", "memory_name")
-def memory(name,
-           size,
-           memory_name=None,
-           is_seq=False,
-           boot_layer=None,
-           boot_bias=None,
-           boot_bias_active_type=None,
-           boot_with_const_id=None):
-    """
-    The memory takes a layer's output at previous time step as its own output.
-
-    If boot_bias, the activation of the bias is the initial value of the memory.
-
-    If boot_with_const_id is set, then the memory's output at the first time step
-    is a IndexSlot, the Arguments.ids()[0] is this :code:`cost_id`.
-
-    If boot_layer is specified, the memory's output at the first time step will
-    be the boot_layer's output.
-
-    In other case, the default memory's output at the first time step is zero.
-
-    .. code-block:: python
-
-       mem = memory(size=256, name='state')
-       state = fc_layer(input=mem, size=256, name='state')
-
-    If you do not want to specify the name, you can also use set_input()
-    to specify the layer to be remembered as the following:
-
-    .. code-block:: python
-
-       mem = memory(size=256)
-       state = fc_layer(input=mem, size=256)
-       mem.set_input(mem)
-
-    :param name: The name of the layer which this memory remembers.
-                 If name is None, user should call set_input() to specify the
-                 name of the layer which this memory remembers.
-    :type name: basestring
-    :param size: The dimensionality of memory.
-    :type size: int
-    :param memory_name: The name of the memory. It is ignored when name is provided.
-    :type memory_name: basestring
-    :param is_seq: DEPRECATED. is sequence for boot_layer
-    :type is_seq: bool
-    :param boot_layer: This parameter specifies memory's output at the first time
-                       step and the output is boot_layer's output.
-    :type boot_layer: LayerOutput | None
-    :param boot_bias: The bias attribute of memory's output at the first time step.
-                      If the parameter is set to False or an object whose type is not
-                      ParameterAttribute, no bias is defined. If the parameter is set
-                      to True, the bias is initialized to zero.
-    :type boot_bias: ParameterAttribute | None
-    :param boot_bias_active_type: Activation type for memory's bias at the first time
-                                  step. LinearActivation is the default activation.
-    :type boot_bias_active_type: BaseActivation
-    :param boot_with_const_id: This parameter specifies memory's output at the first
-                               time step and the output is an index.
-    :type boot_with_const_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if boot_bias_active_type is None:
-        boot_bias_active_type = LinearActivation()
-
-    assert boot_bias is None or isinstance(boot_bias, ParameterAttribute)
-    if isinstance(boot_bias, ParameterAttribute):
-        boot_bias = ParamAttr.to_bias(boot_bias)
-
-    assert boot_layer is None or isinstance(boot_layer, LayerOutput)
-    if name is not None:
-        memory_name = None
-
-    memory_name = Memory(
-        name,
-        size,
-        boot_layer=boot_layer.name if boot_layer is not None else None,
-        boot_bias=boot_bias,
-        boot_bias_active_type=boot_bias_active_type.name,
-        boot_with_const_id=boot_with_const_id,
-        memory_name=memory_name)
-
-    lout = LayerOutput(
-        name=memory_name,
-        size=size,
-        layer_type=LayerType.MEMORY,
-        parents=[boot_layer] if boot_layer is not None else None)
-    return lout
-
-
-@wrap_bias_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=['state_act'], act=TanhActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('lstm_step')
-@layer_support()
-def lstm_step_layer(input,
-                    state,
-                    size=None,
-                    act=None,
-                    name=None,
-                    gate_act=None,
-                    state_act=None,
-                    bias_attr=None,
-                    layer_attr=None):
-    """
-    LSTM Step Layer. This function is used only in recurrent_group.
-    The lstm equations are shown as follows.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-
-    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
-    :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vectors.
-
-    The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
-
-    ..  math::
-
-        i_t = \\sigma(input + W_{ci}c_{t-1} + b_i)
-
-        ...
-
-
-    This layer has two outputs. The default output is :math:`h_t`. The other
-    output is :math:`o_t`, whose name is 'state' and users can use
-    :code:`get_output_layer` to extract this output.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The dimension of this layer's output, which must be
-                 equal to the dimension of the state.
-    :type size: int
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param state: The state of the LSTM unit.
-    :type state: LayerOutput
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of the gate. SigmoidActivation is the
-                     default activation.
-    :type gate_act: BaseActivation
-    :param state_act: Activation type of the state. TanhActivation is the
-                      default activation.
-    :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert size is None or state.size == size
-    size = state.size
-    Layer(
-        name=name,
-        type=LayerType.LSTM_STEP_LAYER,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        active_state_type=state_act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        size=state.size,
-        inputs=[input.name, state.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.LSTM_STEP_LAYER,
-        parents=[input, state],
-        activation=act,
-        size=size,
-        outputs=['default', 'state'])
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('gru_step')
-@layer_support()
-def gru_step_layer(input,
-                   output_mem,
-                   size=None,
-                   act=None,
-                   name=None,
-                   gate_act=None,
-                   bias_attr=None,
-                   param_attr=None,
-                   layer_attr=None):
-    """
-
-    :param input: The input of this layer, whose dimension can be divided by 3.
-    :type input: LayerOutput
-    :param output_mem: A memory which memorizes the output of this layer at previous
-                       time step.
-    :type output_mem: LayerOutput
-    :param size: The dimension of this layer's output. If it is not set or set to None,
-                 it will be set to one-third of the dimension of the input automatically.
-    :type size: int
-    :param act: Activation type of this layer's output. TanhActivation
-                is the default activation.
-    :type act: BaseActivation
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
-                     the default activation.
-    :type gate_act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute, no bias
-                      is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.size % 3 == 0
-    if size is None:
-        size = input.size / 3
-    Layer(
-        name=name,
-        type=LayerType.GRU_STEP_LAYER,
-        # The parameter here is for transforming the output_mem. The input has
-        # already been transformed outside this module so it does not need
-        # parameter associated with it.
-        # The parameter here is instead grouped with input is due to
-        # backward model compatibility.
-        inputs=[Input(input.name, **param_attr.attr), output_mem.name],
-        bias=ParamAttr.to_bias(bias_attr),
-        size=size,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.GRU_STEP_LAYER,
-        parents=[input, output_mem],
-        size=size,
-        activation=act)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('gru_step_naive')
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def gru_step_naive_layer(input,
-                         output_mem,
-                         size=None,
-                         name=None,
-                         act=None,
-                         gate_act=None,
-                         bias_attr=None,
-                         param_attr=None,
-                         layer_attr=None):
-    """
-    GRU Step Layer, which is realized using PaddlePaddle API. It supports ERROR_CLIPPING
-    and DROPOUT.
-
-    :param input: The input of this layer, whose dimensionality can be divided by 3.
-    :param output_mem: A memory which memorizes the output of this layer at previous
-                       time step.
-    :type output_mem: LayerOutput
-    :param size: The dimension of this layer's output. If it is not set or set to None,
-                 it will be set to one-third of the dimension of the input automatically.
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param act: Activation type of this layer's output. TanhActivation
-                is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation
-                     is the default activation.
-    :type gate_act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute, no bias
-                      is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if input.size % 3 != 0:
-        raise ValueError("GruStep input size must be divided by 3")
-    if size is None:
-        size = input.size / 3
-
-    if bias_attr and bias_attr.attr.get("parameter_name", None) is not None:
-        raise ValueError("You should not specify the field `name` in bias_attr."
-                         " Otherwise, the three biases, which correponding to "
-                         " the two gates and the mixed layer for computing Wx+b"
-                         ", will share the same parameter matrix unexpectedly.")
-
-    def __gate__(gate_name, offset):
-        with mixed_layer(
-                name=name + "_" + gate_name,
-                size=size,
-                layer_attr=layer_attr,
-                bias_attr=bias_attr,
-                act=gate_act) as gate:
-            gate += identity_projection(input=input, offset=offset)
-            gate += full_matrix_projection(
-                input=output_mem, param_attr=param_attr)
-        return gate
-
-    update_gate = __gate__("update", 0)
-    reset_gate = __gate__("reset", size)
-
-    with mixed_layer(
-            name=name + "_reset_output", bias_attr=False) as reset_output:
-        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
-
-    with mixed_layer(
-            name=name + "_output_candidate",
-            size=size,
-            layer_attr=layer_attr,
-            bias_attr=bias_attr,
-            act=act) as output_candidate:
-        output_candidate += identity_projection(input=input, offset=2 * size)
-        output_candidate += full_matrix_projection(
-            input=reset_output, param_attr=param_attr)
-
-    with mixed_layer(name=name) as output:
-        output += identity_projection(output_mem)
-        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
-        output += dotmul_operator(a=output_candidate, b=update_gate)
-
-    return output
-
-
-@wrap_name_default()
-@layer_support()
-def get_output_layer(input, arg_name, name=None, layer_attr=None):
-    """
-    Get layer's output by name. In PaddlePaddle, a layer might return multiple
-    values, but returns one layer's output. If the user wants to use another
-    output besides the default one, please use get_output_layer first to get
-    the output from input.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layer. And this layer should contain
-                   multiple outputs.
-    :type input: LayerOutput
-    :param arg_name: The name of the output to be extracted from the input layer.
-    :type arg_name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    # GetOutputLayer
-    assert arg_name in input.outputs, 'Get Output From an not existed input.' \
-                                      ' The get output name is %s, which not' \
-                                      ' in %s' % (
-                                          arg_name, ",".join(input.outputs))
-    Layer(
-        name=name,
-        type=LayerType.GET_OUTPUT_LAYER,
-        inputs=[Input(
-            input.name, input_layer_argument=arg_name)],
-        size=input.size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.GET_OUTPUT_LAYER,
-        parents=[input],
-        size=input.size)
-
-
-@wrap_name_default()
-@wrap_act_default()
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@layer_support()
-def recurrent_layer(input,
-                    act=None,
-                    bias_attr=None,
-                    param_attr=None,
-                    name=None,
-                    reverse=False,
-                    layer_attr=None):
-    """
-    Simple recurrent unit layer. It is just a fully connect layer through both
-    time and neural network.
-
-    For each sequence [start, end] it performs the following computation\:
-
-    ..  math::
-
-        out_{i} = act(in_{i})     \\      \\      \\text{for} \\ i = start \\\\
-        out_{i} = act(in_{i} + out_{i-1} * W) \\ \\ \\text{for} \\ start < i <= end
-
-    If reversed is true, the order is reversed\:
-
-    ..  math::
-
-        out_{i} = act(in_{i})           \\    \\   \\text{for} \\ i = end  \\\\
-        out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end
-
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If the parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.RECURRENT_LAYER,
-        inputs=Input(input.name, **param_attr.attr),
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        reversed=reverse,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.RECURRENT_LAYER,
-        parents=[input],
-        size=input.size,
-        activation=act,
-        reverse=reverse)
-
-
-class StaticInput(object):
-    """
-    StaticInput is only used in recurrent_group which defines a read-only memory
-    and can be a sequence or non-sequence.
-    :param size: DEPRECATED
-    :param is_seq: DEPRECATED
-    """
-
-    def __init__(self, input, is_seq=False, size=None):
-        assert isinstance(input, LayerOutput)
-        self.input = input
-        assert input.size is not None
-        if size is not None:
-            assert input.size == size
-
-
-def SubsequenceInput(input):
-    """
-    DEPRECATED.
-    Input sequence has sub-sequence, used in recurrent_group.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       input = SubsequenceInput(layer)
-    """
-    return input
-
-
-@wrap_name_default("recurrent_group")
-def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
-    """
-    Recurrent layer group is an extremely flexible recurrent unit in
-    PaddlePaddle. As long as the user defines the calculation done within a
-    time step, PaddlePaddle will iterate such a recurrent calculation over
-    sequence input. This is useful for attention-based models, or Neural
-    Turning Machine like models.
-
-    The basic usage (time steps) is:
-
-    .. code-block:: python
-
-       def step(input):
-           output = fc_layer(input=layer,
-                             size=1024,
-                             act=LinearActivation(),
-                             bias_attr=False)
-           return output
-
-       group = recurrent_group(input=layer,
-                               step=step)
-
-    You can see following configs for further usages:
-
-    - time steps: lstmemory_group, paddle/legacy/gserver/tests/sequence_layer_group.conf, \
-                  demo/seqToseq/seqToseq_net.py
-    - sequence steps: paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
-
-    :param step: A step function which takes the input of recurrent_group as its own
-                 input and returns values as recurrent_group's output every time step.
-
-                 The recurrent group scatters a sequence into time steps. And
-                 for each time step, it will invoke step function, and return
-                 a time step result. Then gather outputs of each time step into
-                 layer group's output.
-
-    :type step: callable
-
-    :param name: The recurrent_group's name. It is optional.
-    :type name: basestring
-
-    :param input: Input links array.
-
-                  LayerOutput will be scattered into time steps.
-                  SubsequenceInput will be scattered into sequence steps.
-                  StaticInput will be imported to each time step, and doesn't change
-                  over time. It's a mechanism to access layer outside step function.
-
-    :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
-
-    :param reverse: If reverse is set to True, the recurrent unit will process the
-                    input sequence in a reverse order.
-    :type reverse: bool
-
-    :param targetInlink: DEPRECATED.
-                         The input layer which share info with layer group's output
-
-                         Param input specifies multiple input layers. For
-                         SubsequenceInput inputs, config should assign one input
-                         layer that share info(the number of sentences and the number
-                         of words in each sentence) with all layer group's outputs.
-                         targetInlink should be one of the layer group's input.
-
-    :type targetInlink: LayerOutput | SubsequenceInput
-
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    model_type('recurrent_nn')
-
-    if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
-        input = [input]
-    assert isinstance(input, collections.Sequence)
-
-    def is_in_links(x):
-        return isinstance(x, LayerOutput)
-
-    in_links = filter(is_in_links, input)
-
-    RecurrentLayerGroupWithoutOutLinksBegin(
-        name=name,
-        in_links=map(lambda x: x.name, in_links),
-        seq_reversed=reverse)
-    in_args = []
-    for each_input in input:
-        if isinstance(each_input, StaticInput):  # StaticInput
-            mem_name = "__%s_memory__" % each_input.input.name
-            mem = memory(
-                name=None,
-                size=each_input.input.size,
-                boot_layer=each_input.input)
-            mem.set_input(mem)
-            in_args.append(mem)
-        else:
-            in_args.append(each_input)
-
-    layer_outs = step(*in_args)
-
-    if isinstance(layer_outs, LayerOutput):
-        layer_outs = [layer_outs]
-
-    for layer_out in layer_outs:
-        assert isinstance(
-            layer_out, LayerOutput
-        ), "Type of step function's return value must be LayerOutput."
-        layer_out.reverse = reverse
-        RecurrentLayerGroupSetOutLink(layer_out.name)
-
-    RecurrentLayerGroupEnd(name=name)
-
-    for layer_out in layer_outs:
-        # The previous full_name is the name inside the recurrent group.
-        # We need a full_name outside the recurrent group.
-        layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
-
-    if len(layer_outs) == 1:
-        return layer_outs[0]
-    else:
-        return layer_outs
-
-
-class BaseGeneratedInput(object):
-    def __init__(self):
-        self.bos_id = None
-        self.eos_id = None
-
-    def before_real_step(self):
-        raise NotImplementedError()
-
-    def after_real_step(self, *args):
-        raise NotImplementedError()
-
-
-class GeneratedInput(BaseGeneratedInput):
-    def after_real_step(self, input):
-        if isinstance(input, LayerOutput):
-            input = [input]
-        elif isinstance(input, collections.Sequence):
-            input = list(input)
-            if len(input) > 1:
-                logger.info(
-                    ("More than one layers inside the recurrent_group "
-                     "are returned as outputs of the entire recurrent_group "
-                     "PLEASE garantee the first output is probability of "
-                     "the predicted next word."))
-
-        return [maxid_layer(
-            input=input[0], name='__beam_search_predict__')] + (
-                input[1:] if len(input) > 1 else [])
-
-    def before_real_step(self):
-        predict_id = memory(
-            name='__beam_search_predict__',
-            size=self.size,
-            boot_with_const_id=self.bos_id)
-
-        trg_emb = embedding_layer(
-            input=predict_id,
-            size=self.embedding_size,
-            param_attr=ParamAttr(name=self.embedding_name))
-        return trg_emb
-
-    def __init__(self, size, embedding_name, embedding_size):
-        super(GeneratedInput, self).__init__()
-        self.size = size
-        self.embedding_name = embedding_name
-        self.embedding_size = embedding_size
-
-
-@wrap_name_default()
-def maxid_layer(input, name=None, layer_attr=None):
-    """
-    A layer for finding the id which has the maximal value for each sample.
-    The result is stored in output.ids.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       maxid = maxid_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput)
-    l = Layer(
-        name=name,
-        type='maxid',
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.MAXID_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def dot_prod_layer(input1, input2, name=None, layer_attr=None):
-    """
-    A layer for computing the dot product of two vectors.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input1: The first input layer.
-    :type input1: LayerOutput
-    :param input2: The second input layer.
-    :type input2: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input1, LayerOutput)
-    assert isinstance(input2, LayerOutput)
-    assert input1.size == input2.size, ("Two inputs should have the same size.")
-
-    l = Layer(
-        name=name,
-        type=LayerType.DOT_PROD_LAYER,
-        inputs=[input1.name, input2.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.DOT_PROD_LAYER,
-        parents=[input1, input2],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def out_prod_layer(input1, input2, name=None, layer_attr=None):
-    """
-    A layer for computing the outer product of two vectors
-    The result is a matrix of size(input1) x size(input2)
-
-    The example usage is:
-
-    .. code-block:: python
-
-       out_prod = out_prod_layer(input1=vec1, input2=vec2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input1: The first input layer.
-    :type input: LayerOutput
-    :param input2: The second input layer.
-    :type input2: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input1, LayerOutput)
-    assert isinstance(input2, LayerOutput)
-    l = Layer(
-        name=name,
-        type=LayerType.OUT_PROD_LAYER,
-        inputs=[input1.name, input2.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.OUT_PROD_LAYER,
-        parents=[input1, input2],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def eos_layer(input, eos_id, name=None, layer_attr=None):
-    """
-    A layer for checking EOS for each sample:
-    - output_id = (input_id == conf.eos_id)
-
-    The result is stored in output\_.ids.
-    It is used by recurrent layer group.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       eos = eos_layer(input=layer, eos_id=id)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param eos_id: End id of sequence
-    :type eos_id: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    l = Layer(
-        name=name,
-        type=LayerType.EOSID_LAYER,
-        eos_id=eos_id,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.EOSID_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def beam_search(step,
-                input,
-                bos_id,
-                eos_id,
-                beam_size,
-                max_length=500,
-                name=None,
-                num_results_per_sample=None):
-    """
-    Beam search is a heuristic search algorithm used in sequence generation.
-    It explores a graph by expanding the most promising nodes in a limited set
-    to maintain tractability.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        def rnn_step(input):
-            last_time_step_output = memory(name='rnn', size=512)
-            with mixed_layer(size=512, name='rnn') as simple_rnn:
-                simple_rnn += full_matrix_projection(input)
-                simple_rnn += last_time_step_output
-            return simple_rnn
-
-        generated_word_embedding = GeneratedInput(
-                               size=target_dictionary_dim,
-                               embedding_name="target_language_embedding",
-                               embedding_size=word_vector_dim)
-
-        beam_gen = beam_search(name="decoder",
-                               step=rnn_step,
-                               input=[StaticInput(encoder_last),
-                                      generated_word_embedding],
-                               bos_id=0,
-                               eos_id=1,
-                               beam_size=5)
-
-    Please see the following demo for more details:
-
-    - machine translation : demo/seqToseq/translation/gen.conf \
-                            demo/seqToseq/seqToseq_net.py
-
-    :param name: The name of the recurrent unit that is responsible for
-                 generating sequences. It is optional.
-    :type name: basestring
-    :param step: A callable function that defines the calculation in a time
-                 step, and it is applied to sequences with arbitrary length by
-                 sharing a same set of weights.
-
-                 You can refer to the first parameter of recurrent_group, or
-                 demo/seqToseq/seqToseq_net.py for more details.
-    :type step: callable
-    :param input: Input data for the recurrent unit, which should include the
-                  previously generated words as a GeneratedInput object.
-                  In beam_search, none of the input's type should be LayerOutput.
-    :type input: list
-    :param bos_id: Index of the start symbol in the dictionary. The start symbol
-                   is a special token for NLP task, which indicates the
-                   beginning of a sequence. In the generation task, the start
-                   symbol is essential, since it is used to initialize the RNN
-                   internal state.
-    :type bos_id: int
-    :param eos_id: Index of the end symbol in the dictionary. The end symbol is
-                   a special token for NLP task, which indicates the end of a
-                   sequence. The generation process will stop once the end
-                   symbol is generated, or a pre-defined max iteration number
-                   is exceeded.
-    :type eos_id: int
-    :param max_length: Max generated sequence length.
-    :type max_length: int
-    :param beam_size: Beam search for sequence generation is an iterative search
-                      algorithm. To maintain tractability, every iteration only
-                      only stores a predetermined number, called the beam_size,
-                      of the most promising next words. The greater the beam
-                      size, the fewer candidate words are pruned.
-    :type beam_size: int
-    :param num_results_per_sample: Number of the generated results per input
-                                  sequence. This number must always be less than
-                                  beam size.
-    :type num_results_per_sample: int
-    :return: The generated word index.
-    :rtype: LayerOutput
-    """
-
-    if num_results_per_sample is None:
-        num_results_per_sample = beam_size
-    if num_results_per_sample > beam_size:
-        logger.warning("num_results_per_sample should be less than beam_size")
-
-    if isinstance(input, StaticInput) or isinstance(input, BaseGeneratedInput):
-        input = [input]
-
-    generated_input_index = -1
-
-    real_input = []
-    for i, each_input in enumerate(input):
-        assert not isinstance(each_input, LayerOutput), (
-            "in beam_search, "
-            "none of the input should has a type of LayerOutput.")
-        if isinstance(each_input, BaseGeneratedInput):
-            assert generated_input_index == -1, ("recurrent_group accepts "
-                                                 "only one GeneratedInput.")
-            generated_input_index = i
-
-        else:
-            real_input.append(each_input)
-
-    assert generated_input_index != -1, "No GeneratedInput is given."
-
-    gipt = input[generated_input_index]
-
-    gipt.bos_id = bos_id
-    gipt.eos_id = eos_id
-
-    def __real_step__(*args):
-        eos_name = "__%s_eos_layer__" % name
-        RecurrentLayerGroupSetGenerator(
-            Generator(
-                eos_layer_name=eos_name,
-                max_num_frames=max_length,
-                beam_size=beam_size,
-                num_results_per_sample=num_results_per_sample))
-
-        args = list(args)
-        args.insert(generated_input_index, gipt.before_real_step())
-
-        predict = gipt.after_real_step(step(*args))
-
-        eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
-        return predict
-
-    return recurrent_group(
-        step=__real_step__, input=real_input, reverse=False, name=name)
-
-
-def __cost_input__(input, label, weight=None):
-    """
-    inputs and parents for cost layers.
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    if isinstance(label, LayerOutput):
-        label = [label]
-    ipts = [Input(ipt.name) for ipt in (input + label)]
-    parents = [ipt for ipt in (input + label)]
-    if weight is not None:
-        assert weight.size == 1
-        ipts.append(Input(weight.name))
-        parents.append(weight)
-    return ipts, parents
-
-
-@wrap_name_default()
-@layer_support()
-def square_error_cost(input,
-                      label,
-                      weight=None,
-                      name=None,
-                      coeff=1.0,
-                      layer_attr=None):
-    """
-    sum of square error cost:
-
-    ..  math::
-
-        cost = \\sum_{i=1}^N(t_i-y_i)^2
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    ipts, parents = __cost_input__(input, label, weight)
-
-    Layer(
-        inputs=ipts,
-        type="square_error",
-        name=name,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
-
-
-regression_cost = square_error_cost
-
-
-@wrap_name_default("cost")
-@layer_support()
-def classification_cost(input,
-                        label,
-                        weight=None,
-                        name=None,
-                        evaluator=classification_error_evaluator,
-                        layer_attr=None,
-                        coeff=1.):
-    """
-    classification cost Layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param evaluator: Evaluator method. classification_error_evaluator is the default.
-    :type evaluator: Evaluator method
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.layer_type != LayerType.DATA
-    assert isinstance(input.activation, SoftmaxActivation)
-    assert label.layer_type == LayerType.DATA
-
-    ipts, parents = __cost_input__(input, label, weight)
-
-    Layer(
-        name=name,
-        type="multi-class-cross-entropy",
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    def __add_evaluator__(e):
-        assert callable(e)
-        assert hasattr(e, 'is_evaluator')
-        assert isinstance(e.is_evaluator, bool)
-        assert e.is_evaluator
-        assert hasattr(e, "for_classification")
-        assert isinstance(e.for_classification, bool)
-        assert e.for_classification
-
-        e(name=e.__name__, input=input, label=label, weight=weight)
-
-    if not isinstance(evaluator, collections.Sequence):
-        evaluator = [evaluator]
-
-    for each_evaluator in evaluator:
-        __add_evaluator__(each_evaluator)
-
-    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
-
-
-def conv_operator(img,
-                  filter,
-                  filter_size,
-                  num_filters,
-                  num_channels=None,
-                  stride=1,
-                  padding=0,
-                  filter_size_y=None,
-                  stride_y=None,
-                  padding_y=None,
-                  trans=False):
-    """
-    Different from img_conv_layer, conv_op is an Operator, which can be used
-    in mixed_layer. And conv_op takes two inputs to perform convolution.
-    The first input is the image and the second is filter kernel. It only
-    supports GPU mode.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = conv_operator(img=input1,
-                          filter=input2,
-                          filter_size=3,
-                          num_filters=64,
-                          num_channels=64)
-
-    :param img: The input image.
-    :type img: LayerOutput
-    :param filter: The input filter.
-    :type filter: LayerOutput
-    :param filter_size: The dimension of the filter kernel on the x axis.
-    :type filter_size: int
-    :param filter_size_y: The dimension of the filter kernel on the y axis.
-                          If the parameter is not set or set to None, it will
-                          set to 'filter_size' automatically.
-    :type filter_size_y: int
-    :param num_filters: The number of the output channels.
-    :type num_filters: int
-    :param num_channels: The number of the input channels. If the parameter is not set
-                         or set to None, it will be automatically set to the channel
-                         number of the 'img'.
-    :type num_channels: int
-    :param stride: The stride on the x axis.
-    :type stride: int
-    :param stride_y: The stride on the y axis. If the parameter is not set or
-                     set to None, it will be set to 'stride' automatically.
-    :type stride_y: int
-    :param padding: The padding size on the x axis.
-    :type padding: int
-    :param padding_y: The padding size on the y axis. If the parameter is not set
-                      or set to None, it will be set to 'padding' automatically.
-    :type padding_y: int
-    :return: A ConvOperator Object.
-    :rtype: ConvOperator
-    """
-    if filter_size_y is None:
-        filter_size_y = filter_size
-    if stride_y is None:
-        stride_y = stride
-    if padding_y is None:
-        padding_y = padding
-
-    if num_channels is None:
-        num_channels = img.num_filters
-
-    assert isinstance(filter, LayerOutput)
-    assert filter.size is not None
-
-    opCls = ConvTransOperator if trans else ConvOperator
-
-    op = opCls(
-        input_layer_names=[img.name, filter.name],
-        num_filters=num_filters,
-        conv_conf=Conv(
-            filter_size=filter_size,
-            padding=padding,
-            stride=stride,
-            channels=num_channels,
-            filter_size_y=filter_size_y,
-            padding_y=padding_y,
-            stride_y=stride_y,
-            groups=1))
-
-    op.origin = [img, filter]
-    return op
-
-
-@wrap_param_attr_default()
-def conv_projection(input,
-                    filter_size,
-                    num_filters,
-                    num_channels=None,
-                    stride=1,
-                    padding=0,
-                    filter_size_y=None,
-                    stride_y=None,
-                    padding_y=None,
-                    groups=1,
-                    param_attr=None,
-                    trans=False):
-    """
-    Different from img_conv_layer and conv_op, conv_projection is a Projection,
-    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
-    convolution and only supports GPU mode.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = conv_projection(input=input1,
-                              filter_size=3,
-                              num_filters=64,
-                              num_channels=64)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel. If the parameter is
-                        set to one integer, the two dimensions on x and y axises
-                        will be same when filter_size_y is not set. If it is set
-                        to a list, the first element indicates the dimension on
-                        the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size_y is not provided.
-    :type filter_size: int | tuple | list
-    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
-                          is not set, it will be set automatically according to filter_size.
-    :type filter_size_y: int
-    :param num_filters: The number of filters.
-    :type num_filters: int
-    :param num_channels: The number of the input channels.
-    :type num_channels: int
-    :param stride: The strides. If the parameter is set to one integer, the strides
-                   on x and y axises will be same when stride_y is not set. If it is
-                   set to a list, the first element indicates the stride on the x axis,
-                   and the second is used to specify the stride on the y axis when
-                   stride_y is not provided.
-    :type stride: int | tuple | list
-    :param stride_y: The stride on the y axis.
-    :type stride_y: int
-    :param padding: The padding sizes. If the parameter is set to one integer, the padding
-                    sizes on x and y axises will be same when padding_y is not set. If it
-                    is set to a list, the first element indicates the padding size on the
-                    x axis, and the second is used to specify the padding size on the y axis
-                    when padding_y is not provided.
-    :type padding: int | tuple | list
-    :param padding_y: The padding size on the y axis.
-    :type padding_y: int
-    :param groups: The group number.
-    :type groups: int
-    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param trans: Whether it is ConvTransProjection or ConvProjection
-    :type trans: bool
-    :return: A Projection Object.
-    :rtype: ConvTransProjection | ConvProjection
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if filter_size_y is None:
-        if isinstance(filter_size, collections.Sequence):
-            assert len(filter_size) == 2
-            filter_size, filter_size_y = filter_size
-        else:
-            filter_size_y = filter_size
-
-    if stride_y is None:
-        if isinstance(stride, collections.Sequence):
-            assert len(stride) == 2
-            stride, stride_y = stride
-        else:
-            stride_y = stride
-
-    if padding_y is None:
-        if isinstance(padding, collections.Sequence):
-            assert len(padding) == 2
-            padding, padding_y = padding
-        else:
-            padding_y = padding
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    projCls = ConvTransProjection if trans else ConvProjection
-
-    proj = projCls(
-        input_layer_name=input.name,
-        num_filters=num_filters,
-        conv_conf=Conv(
-            filter_size=filter_size,
-            padding=padding,
-            stride=stride,
-            channels=num_channels,
-            filter_size_y=filter_size_y,
-            padding_y=padding_y,
-            stride_y=stride_y,
-            groups=groups),
-        **param_attr.attr)
-
-    proj.origin = input
-    return proj
-
-
-@wrap_name_default("pad")
-@layer_support()
-def pad_layer(input,
-              pad_c=None,
-              pad_h=None,
-              pad_w=None,
-              name=None,
-              layer_attr=None):
-    """
-    This operation pads zeros to the input data according to pad_c,pad_h
-    and pad_w. pad_c, pad_h, pad_w specify the size in the corresponding
-    dimension. And the input data shape is NCHW.
-
-    For example, pad_c=[2,3] means padding 2 zeros before the input data
-    and 3 zeros after the input data in the channel dimension. pad_h means
-    padding zeros in the height dimension. pad_w means padding zeros in the
-    width dimension.
-
-    For example,
-
-    .. code-block:: python
-
-       input(2,2,2,3)  = [
-                           [ [[1,2,3], [3,4,5]],
-                             [[2,3,5], [1,6,7]] ],
-                           [ [[4,3,1], [1,8,7]],
-                             [[3,8,9], [2,3,5]] ]
-                         ]
-
-       pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
-
-       output(2,4,2,3) = [
-                           [ [[0,0,0], [0,0,0]],
-                             [[1,2,3], [3,4,5]],
-                             [[2,3,5], [1,6,7]],
-                             [[0,0,0], [0,0,0]] ],
-                           [ [[0,0,0], [0,0,0]],
-                             [[4,3,1], [1,8,7]],
-                             [[3,8,9], [2,3,5]],
-                             [[0,0,0], [0,0,0]] ]
-                         ]
-
-    The simply usage is:
-
-    .. code-block:: python
-
-       pad = pad_layer(input=ipt,
-                       pad_c=[4,4],
-                       pad_h=[0,0],
-                       pad_w=[2,2])
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pad_c: The padding size in the channel dimension.
-    :type pad_c: list | None
-    :param pad_h: The padding size in the height dimension.
-    :type pad_h: list | None
-    :param pad_w: The padding size in the width dimension.
-    :type pad_w: list | None
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if pad_c is not None:
-        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
-    else:
-        pad_c = [0, 0]
-
-    if pad_h is not None:
-        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
-    else:
-        pad_h = [0, 0]
-
-    if pad_w is not None:
-        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
-    else:
-        pad_w = [0, 0]
-
-    assert input.num_filters is not None
-    in_ch = input.num_filters
-    out_ch = in_ch + pad_c[0] + pad_c[1]
-
-    l = Layer(
-        name=name,
-        type=LayerType.PAD_LAYER,
-        inputs=Input(
-            input.name,
-            pad=Pad(
-                channels=in_ch,
-                pad_c=pad_c,
-                pad_h=pad_h,
-                pad_w=pad_w, )),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.PAD_LAYER,
-        parents=[input],
-        num_filters=out_ch,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def conv_shift_layer(a, b, name=None, layer_attr=None):
-    """
-    This layer performs cyclic convolution on two inputs. For example:
-      - a[in]: contains M elements.
-      - b[in]: contains N elements (N should be odd).
-      - c[out]: contains M elements.
-
-    .. math::
-
-        c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
-
-    In this formula:
-     - a's index is computed modulo M. When it is negative, then get item from
-       the right side (which is the end of array) to the left.
-     - b's index is computed modulo N. When it is negative, then get item from
-       the right size (which is the end of array) to the left.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       conv_shift = conv_shift_layer(a=layer1, b=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    assert b.size is None or b.size % 2 == 1  # size of b must be odd.
-    Layer(
-        name=name,
-        type=LayerType.CONV_SHIFT_LAYER,
-        inputs=[a.name, b.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], size=a.size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=LinearActivation())
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def tensor_layer(a,
-                 b,
-                 size,
-                 act=None,
-                 name=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 layer_attr=None):
-    """
-    This layer performs tensor operation on two inputs.
-    For example:
-
-    .. math::
-       y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
-
-    In this formular:
-      - :math:`a`: the first input contains M elements.
-      - :math:`b`: the second input contains N elements.
-      - :math:`y_{i}`: the i-th element of y.
-      - :math:`W_{i}`: the i-th learned weight, shape if [M, N]
-      - :math:`b^\mathrm{T}`: the transpose of :math:`b_{2}`.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       tensor = tensor_layer(a=layer1, b=layer2, size=1000)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param act: Activation type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    Layer(
-        name=name,
-        size=size,
-        type=LayerType.TENSOR_LAYER,
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(a.name, **param_attr.attr), Input(b.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.TENSOR_LAYER, parents=[a, b], activation=act, size=size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default()
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def selective_fc_layer(input,
-                       size,
-                       select=None,
-                       act=None,
-                       name=None,
-                       pass_generation=False,
-                       has_selected_colums=True,
-                       mul_ratio=0.02,
-                       param_attr=None,
-                       bias_attr=None,
-                       layer_attr=None):
-    """
-    Selectived fully connected layer. Different from fc_layer, the output
-    of this layer can be sparse. It requires an additional input to indicate
-    several selected columns for output. If the selected columns is not
-    specified, selective_fc_layer acts exactly like fc_layer.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param select: The layer to select columns to output. It should be a sparse
-                   binary matrix, and is treated as the mask of selective fc. If
-                   it is not set or set to None, selective_fc_layer acts exactly
-                   like fc_layer.
-    :type select: LayerOutput
-    :param size: The dimension of this layer, which should be equal to that of
-                 the layer 'select'.
-    :type size: int
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param pass_generation: The flag which indicates whether it is during generation.
-    :type pass_generation: bool
-    :param has_selected_colums: The flag which indicates whether the parameter 'select'
-                                has been set. True is the default.
-    :type has_selected_colums: bool
-    :param mul_ratio: A ratio helps to judge how sparse the output is and determine
-                      the computation method for speed consideration.
-    :type mul_ratio: float
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            if "parameter_name" in param_attr.attr and len(input) > 1:
-                logger.fatal(
-                    "When the name field of param_attr is manually specified "
-                    "and the input is a list, the param_attr should also be a "
-                    "list with each item being the param_attr for each input "
-                    "item. If only one named param_attr is provided, all the "
-                    "input items would share this parameter.")
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-    assert isinstance(select, LayerOutput)
-    if select.size is not None:
-        assert select.size == size
-    Layer(
-        inputs=[
-            Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
-        ] + [select.name],
-        name=name,
-        type=LayerType.SEL_FC_LAYER,
-        size=size,
-        bias=ParameterAttribute.to_bias(bias_attr),
-        active_type=act.name,
-        selective_fc_pass_generation=pass_generation,
-        has_selected_colums=has_selected_colums,
-        selective_fc_full_mul_ratio=mul_ratio,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEL_FC_LAYER,
-        list(input) + [select],
-        activation=act,
-        size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def sampling_id_layer(input, name=None, layer_attr=None):
-    """
-    A layer for sampling id from a multinomial distribution from the input layer.
-    Sampling one id for one sample.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       samping_id = sampling_id_layer(input=input)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    l = Layer(
-        name=name,
-        type=LayerType.SAMPLING_ID_LAYER,
-        inputs=[Input(input.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SAMPLING_ID_LAYER, input, size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def slope_intercept_layer(input,
-                          name=None,
-                          slope=1.0,
-                          intercept=0.0,
-                          layer_attr=None):
-    """
-    This layer for applying a slope and an intercept to the input.
-
-    ..  math::
-        y = slope * x + intercept
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       scale = slope_intercept_layer(input=input, slope=-1.0, intercept=1.0)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param slope: The scale factor.
-    :type slope: float
-    :param intercept: The offset.
-    :type intercept: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SLOPE_INTERCEPT_LAYER,
-        slope=slope,
-        intercept=intercept,
-        inputs=[Input(input.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SLOPE_INTERCEPT_LAYER, input, size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
-    """
-    A layer for weighted sum of vectors takes two inputs.
-      - Input: size of weights is M
-               size of vectors is M*N
-      - Output: a vector of size=N
-
-    .. math::
-
-       z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj)
-
-    where :math:`0 \le i \le N-1`
-
-    Or in the matrix notation:
-
-    .. math::
-
-       z = x^\mathrm{T} Y
-
-    In this formular:
-      - :math:`x`: weights
-      - :math:`y`: vectors.
-      - :math:`z`: the output.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       linear_comb = linear_comb_layer(weights=weight, vectors=vectors,
-                                       size=elem_dim)
-
-    :param weights: The weight layer.
-    :type weights: LayerOutput
-    :param vectors: The vector layer.
-    :type vectors: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(weights, LayerOutput) and isinstance(vectors, LayerOutput)
-    if vectors.size is not None and weights.size is not None:
-        assert vectors.size % weights.size == 0
-        if size is None:
-            size = vectors.size / weights.size
-        else:
-            assert size == vectors.size / weights.size
-    Layer(
-        name=name,
-        type=LayerType.LINEAR_COMBINATION_LAYER,
-        size=size,
-        inputs=[Input(weights.name), Input(vectors.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size)
-
-
-convex_comb_layer = linear_comb_layer
-
-
-@wrap_name_default()
-@layer_support()
-def block_expand_layer(input,
-                       block_x=0,
-                       block_y=0,
-                       stride_x=0,
-                       stride_y=0,
-                       padding_x=0,
-                       padding_y=0,
-                       num_channels=None,
-                       name=None,
-                       layer_attr=None):
-    """
-    Expand feature map to minibatch matrix.
-       - matrix width is: block_y * block_x * num_channels
-       - matirx height is: outputH * outputW
-
-    .. math::
-
-       outputH = 1 + (2 * padding_y + imgSizeH - block_y + stride_y - 1) / stride_y
-
-       outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x
-
-    The expanding method is the same with ExpandConvLayer, but saved the transposed
-    value. After expanding, output.sequenceStartPositions will store timeline.
-    The number of time steps is outputH * outputW and the dimension of each
-    time step is block_y * block_x * num_channels. This layer can be used after
-    convolutional neural network, and before recurrent neural network.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       block_expand = block_expand_layer(input=layer,
-                                         num_channels=128,
-                                         stride_x=1,
-                                         stride_y=1,
-                                         block_x=1,
-                                         block_x=3)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param block_x: The width of sub block.
-    :type block_x: int
-    :param block_y: The width of sub block.
-    :type block_y: int
-    :param stride_x: The stride size in horizontal direction.
-    :type stride_x: int
-    :param stride_y: The stride size in vertical direction.
-    :type stride_y: int
-    :param padding_x: The padding size in horizontal direction.
-    :type padding_x: int
-    :param padding_y: The padding size in vertical direction.
-    :type padding_y: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring.
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            block_expand=BlockExpand(
-                channels=num_channels,
-                block_x=block_x,
-                block_y=block_y,
-                stride_x=stride_x,
-                stride_y=stride_y,
-                padding_x=padding_x,
-                padding_y=padding_y)),
-        type=LayerType.BLOCK_EXPAND,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.BLOCK_EXPAND, parents=[input], size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
-    """
-    A layer to do max out on convolutional layer output.
-      - Input: the output of a convolutional layer.
-      - Output: feature map size same as the input's, and its channel number is
-        (input channel) / groups.
-
-    So groups should be larger than 1, and the num of channels should be able
-    to be devided by groups.
-
-    Reference:
-        `Maxout Networks
-        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
-        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
-
-
-    .. math::
-
-       & out = \max_k (in[n, k, o_c , s])
-
-       & out_{i * s + j} = \max_k in_{  k * o_{c} * s + i * s + j}
-
-       & s = \\frac{input.size}{ num\_channels}
-
-       & o_{c} = \\frac{num\_channels}{groups}
-
-       & 0 \le i < o_{c}
-
-       & 0 \le j < s
-
-       & 0 \le k < groups
-
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       maxout = maxout_layer(input,
-                             num_channels=128,
-                             groups=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param groups: The group number of input layer.
-    :type groups: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input.activation, LinearActivation)
-    assert groups > 1
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    assert num_channels % groups == 0
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name, maxout=MaxOut(
-                channels=num_channels, groups=groups)),
-        type=LayerType.MAXOUT,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.MAXOUT, parents=[input], size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def ctc_layer(input,
-              label,
-              size=None,
-              name=None,
-              norm_by_times=False,
-              layer_attr=None):
-    """
-    Connectionist Temporal Classification (CTC) is designed for temporal
-    classication task. e.g. sequence labeling problems where the
-    alignment between the inputs and the target labels is unknown.
-
-    Reference:
-        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-        with Recurrent Neural Networks
-        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
-
-    Note:
-        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
-        as the size of the input, where num_classes is the category number.
-        And the 'blank' is the last category index. So the size of 'input' layer (e.g.
-        fc_layer with softmax activation) should be (num_classes + 1). The size of
-        ctc_layer should also be (num_classes + 1).
-
-    The example usage is:
-
-    .. code-block:: python
-
-      ctc = ctc_layer(input=input,
-                      label=label,
-                      size=9055,
-                      norm_by_times=True)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The dimension of this layer, which must be equal to (category number + 1).
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param norm_by_times: Whether to do normalization by times. False is the default.
-    :type norm_by_times: bool
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    if label.size is not None:
-        if size is not None:
-            assert size == label.size + 1
-        else:
-            size = label.size + 1
-    Layer(
-        name=name,
-        type=LayerType.CTC_LAYER,
-        size=size,
-        norm_by_times=norm_by_times,
-        inputs=[input.name, label.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def warp_ctc_layer(input,
-                   label,
-                   size=None,
-                   name=None,
-                   blank=0,
-                   norm_by_times=False,
-                   layer_attr=None):
-    """
-    A layer intergrating the open-source `warp-ctc
-    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
-    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
-    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc repository
-    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
-    the official one, is maintained to enable more compiling options. During the
-    building process, PaddlePaddle will clone the source codes, build and
-    install it to :code:`third_party/install/warpctc` directory.
-
-    Reference:
-        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-        with Recurrent Neural Networks
-        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
-
-    Note:
-        - Let num_classes represents the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the size of
-          warp_ctc layer.
-        - You can set 'blank' to any value ranged in [0, num_classes], which
-          should be consistent with those used in your labels.
-        - As a native 'softmax' activation is interated to the warp-ctc library,
-          'linear' activation is expected to be used instead in the 'input' layer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      ctc = warp_ctc_layer(input=input,
-                           label=label,
-                           size=1001,
-                           blank=1000,
-                           norm_by_times=False)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The dimension of this layer, which must be equal to (category number + 1).
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param blank: The 'blank' label used in ctc.
-    :type blank: int
-    :param norm_by_times: Whether to do normalization by times. False is the default.
-    :type norm_by_times: bool
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    if label.size is not None:
-        if size is not None:
-            assert size == label.size + 1
-        else:
-            size = label.size + 1
-    Layer(
-        name=name,
-        type=LayerType.WARP_CTC_LAYER,
-        size=size,
-        blank=blank,
-        norm_by_times=norm_by_times,
-        inputs=[input.name, label.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@layer_support()
-def crf_layer(input,
-              label,
-              size=None,
-              weight=None,
-              param_attr=None,
-              name=None,
-              coeff=1.0,
-              layer_attr=None):
-    """
-    A layer for calculating the cost of sequential conditional random
-    field model.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      crf = crf_layer(input=input,
-                      label=label,
-                      size=label_dim)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The category number.
-    :type size: int
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    assert weight is None or isinstance(weight, LayerOutput)
-    if input.size is not None and label.size is not None:
-        assert input.size == label.size
-        if size is None:
-            size = input.size
-        else:
-            assert size == input.size
-
-    ipts = [Input(input.name, **param_attr.attr), Input(label.name)]
-    if weight is not None:
-        ipts.append(Input(weight.name))
-
-    Layer(
-        name=name,
-        type=LayerType.CRF_LAYER,
-        size=size,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    parents = [input, label]
-    if weight is not None:
-        parents.append(weight)
-    # The size for LayerOutput means the dimension of the output.
-    # It's different from the meaning of crf layer, which is the number of
-    # classes.
-    return LayerOutput(name, LayerType.CRF_LAYER, parents, size=1)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@layer_support()
-def crf_decoding_layer(input,
-                       size,
-                       label=None,
-                       param_attr=None,
-                       name=None,
-                       layer_attr=None):
-    """
-    A layer for calculating the decoding sequence of sequential conditional
-    random field model. The decoding sequence is stored in output.ids.
-    If the input 'label' is provided, it is treated as the ground-truth label, and
-    this layer will also calculate error. output.value[i] is 1 for an incorrect
-    decoding and 0 for the correct.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      crf_decoding = crf_decoding_layer(input=input,
-                                        size=label_dim)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param label: The input label.
-    :type label: LayerOutput | None
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput)
-    assert label is None or isinstance(label, LayerOutput)
-
-    ipts = [Input(input.name, **param_attr.attr)]
-    if label is not None:
-        ipts.append(Input(label.name))
-
-    Layer(
-        name=name,
-        type=LayerType.CRF_DECODING_LAYER,
-        size=size,
-        inputs=ipts,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    parents = [input]
-    if label is not None:
-        parents.append(label)
-    # The size for LayerOutput means the dimension of the output.
-    # It's different from the meaning of crf layer, which is the number of
-    # classes.
-    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
-
-
-"""
-Following are cost Layers.
-"""
-
-
-@wrap_bias_attr_default(has_bias=True)
-@wrap_param_attr_default()
-@wrap_name_default()
-@layer_support()
-def nce_layer(input,
-              label,
-              num_classes=None,
-              param_attr=None,
-              weight=None,
-              num_neg_samples=10,
-              neg_distribution=None,
-              name=None,
-              bias_attr=None,
-              layer_attr=None):
-    """
-    Noise-contrastive estimation.
-
-    Reference:
-        `A fast and simple algorithm for training neural probabilistic language
-        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = nce_layer(input=[layer1, layer2], label=layer2,
-                        param_attr=[attr1, attr2], weight=layer3,
-                        num_classes=3, neg_distribution=[0.1,0.3,0.6])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input of this layer.
-    :type input: LayerOutput | list | tuple | collections.Sequence
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param num_classes: The number of classes.
-    :type num_classes: int
-    :param act: Activation type. SigmoidActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param num_neg_samples: The number of sampled negative labels. 10 is the
-                            default value.
-    :type num_neg_samples: int
-    :param neg_distribution: The discrete noisy distribution over the output
-                             space from which num_neg_samples negative labels
-                             are sampled. If this parameter is not set, a
-                             uniform distribution will be used. A user-defined
-                             distribution is a list whose length must be equal
-                             to the num_classes. Each member of the list defines
-                             the probability of a class given input x.
-    :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-
-    assert isinstance(label, LayerOutput)
-    assert label.layer_type == LayerType.DATA
-    if num_classes is None:
-        num_classes = label.size
-    if neg_distribution is not None:
-        assert isinstance(neg_distribution, collections.Sequence)
-        assert len(neg_distribution) == num_classes
-        assert abs(sum(neg_distribution) - 1.0) < 1e-5
-
-    ipts_for_layer = []
-    parents = []
-    for each_input, attr in zip(input, param_attr):
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name, **attr.attr))
-        parents.append(each_input)
-    ipts_for_layer.append(label.name)
-    parents.append(label)
-
-    if weight is not None:
-        assert isinstance(weight, LayerOutput)
-        assert weight.layer_type == LayerType.DATA
-        ipts_for_layer.append(weight.name)
-        parents.append(weight)
-
-    l = Layer(
-        name=name,
-        type=LayerType.NCE_LAYER,
-        num_classes=num_classes,
-        neg_sampling_dist=neg_distribution,
-        active_type=SigmoidActivation().name,
-        num_neg_samples=num_neg_samples,
-        inputs=ipts_for_layer,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.NCE_LAYER,
-        parents=parents,
-        size=l.config.size,
-        activation=SigmoidActivation())
-
-
-@wrap_name_default()
-@layer_support()
-def rank_cost(left,
-              right,
-              label,
-              weight=None,
-              name=None,
-              coeff=1.0,
-              layer_attr=None):
-    """
-    A cost Layer for learning to rank using gradient descent.
-
-    Reference:
-        `Learning to Rank using Gradient Descent
-        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
-
-    .. math::
-
-       C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
-
-       o_{i,j} & =  o_i - o_j
-
-       \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
-
-    In this formula:
-      - :math:`C_{i,j}` is the cross entropy cost.
-      - :math:`\\tilde{P_{i,j}}` is the label. 1 means positive order
-        and 0 means reverse order.
-      - :math:`o_i` and :math:`o_j`: the left output and right output.
-        Their dimension is one.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      cost = rank_cost(left=out_left,
-                       right=out_right,
-                       label=label)
-
-    :param left: The first input, the size of this layer is 1.
-    :type left: LayerOutput
-    :param right: The right input, the size of this layer is 1.
-    :type right: LayerOutput
-    :param label: Label is 1 or 0, means positive order and reverse order.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert left.size == 1
-    assert right.size == 1
-    assert label.size == 1
-
-    ipts = [left.name, right.name, label.name]
-    parents = [left, right, label]
-    if weight is not None:
-        ipts.append(weight.name)
-        parents.append(weight)
-
-    Layer(
-        name=name,
-        type=LayerType.RANK_COST,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(name, LayerType.RANK_COST, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def lambda_cost(input,
-                score,
-                name,
-                NDCG_num=5,
-                max_sort_size=-1,
-                layer_attr=None):
-    """
-    lambdaCost for lambdaRank LTR approach.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      cost = lambda_cost(input=input,
-                         score=score,
-                         NDCG_num=8,
-                         max_sort_size=-1)
-
-    :param input: The first input of this layer, which is often a document
-                  samples list of the same query and whose type must be sequence.
-    :type input: LayerOutput
-    :param score: The scores of the samples.
-    :type input: LayerOutput
-    :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
-                     e.g., 5 for NDCG@5. It must be less than or equal to the
-                     minimum size of the list.
-    :type NDCG_num: int
-    :param max_sort_size: The size of partial sorting in calculating gradient. If
-                          max_sort_size is equal to -1 or greater than the number
-                          of the samples in the list, then the algorithm will sort
-                          the entire list to compute the gradient. In other cases,
-                          max_sort_size must be greater than or equal to NDCG_num.
-    :type max_sort_size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput) and isinstance(score, LayerOutput)
-    if score.size is not None:
-        assert score.size == 1
-    Layer(
-        name=name,
-        type=LayerType.LAMBDA_COST,
-        inputs=[input.name, score.name],
-        NDCG_num=NDCG_num,
-        max_sort_size=max_sort_size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.LAMBDA_COST, parents=[input, score], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy(input,
-                  label,
-                  name=None,
-                  coeff=1.0,
-                  weight=None,
-                  layer_attr=None):
-    """
-    A loss layer for multi class entropy.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy(input=input_layer,
-                            label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput.
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutout
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy_with_selfnorm(input,
-                                label,
-                                name=None,
-                                coeff=1.0,
-                                softmax_selfnorm_alpha=0.1,
-                                layer_attr=None):
-    """
-    A loss layer for multi class entropy with selfnorm.
-    Input should be a vector of positive numbers, without normalization.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy_with_selfnorm(input=input_layer,
-                                          label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param softmax_selfnorm_alpha: The scale factor affects the cost.
-    :type softmax_selfnorm_alpha: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_WITH_SELFNORM,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        softmax_selfnorm_alpha=softmax_selfnorm_alpha,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.CROSS_ENTROPY_WITH_SELFNORM,
-        parents=[input, label],
-        size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def sum_cost(input, name=None, layer_attr=None):
-    """
-    A loss layer which calculates the sum of the input as loss.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = sum_cost(input=input_layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput.
-    """
-    assert isinstance(input, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.SUM_COST,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(name, LayerType.SUM_COST, parents=[input], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def huber_regression_cost(input,
-                          label,
-                          name=None,
-                          delta=1.0,
-                          coeff=1.0,
-                          layer_attr=None):
-    """
-    In statistics, the Huber loss is a loss function used in robust regression,
-    that is less sensitive to outliers in data than the squared error loss.
-    Given a prediction f(x), a label y and :math:`\delta`, the loss function
-    is defined as:
-
-    .. math::
-
-       loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta
-
-       loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = huber_regression_cost(input=input_layer, label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param delta: The difference between the observed and predicted values.
-    :type delta: float
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput.
-    """
-    assert isinstance(input, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.HUBER_REGRESSION,
-        inputs=[input.name, label.name],
-        delta=delta,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def huber_classification_cost(input,
-                              label,
-                              name=None,
-                              coeff=1.0,
-                              layer_attr=None):
-    """
-    For classification purposes, a variant of the Huber loss called modified Huber
-    is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
-    a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber
-    loss is defined as:
-
-    .. math:
-
-       loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1
-
-       loss = -4yf(x), otherwise
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = huber_classification_cost(input=input_layer, label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    if input.size is not None:
-        assert input.size == 1
-    Layer(
-        name=name,
-        type=LayerType.HUBER_CLASSIFICATION,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def multi_binary_label_cross_entropy(input,
-                                     label,
-                                     name=None,
-                                     coeff=1.0,
-                                     layer_attr=None):
-    """
-    A loss layer for multi binary label cross entropy.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = multi_binary_label_cross_entropy(input=input_layer,
-                                               label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if input.activation is None or \
-            not isinstance(input.activation, SigmoidActivation):
-        logger.log(logging.WARN,
-                   ("%s is not a recommended activation for "
-                    "multi_binary_label_cross_entropy, sigmoid is better") %
-                   repr(input.activation))
-
-    Layer(
-        name=name,
-        type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
-        parents=[input, label],
-        size=1)
-
-
-class BeamInput(object):
-    """
-    Define the input for cross_entropy_over_beam layer.
-
-    A beam is made up of a triple: the first one is scores over all
-    candidates; the second one is indices of top k selected candidates; the
-    third one is the index of ground truth, which is also always called
-    gold.
-    """
-
-    def __init__(self, candidate_scores, selected_candidates, gold):
-        assert isinstance(candidate_scores, LayerOutput)
-        self.candidate_scores = candidate_scores
-        assert candidate_scores.size == 1
-
-        assert isinstance(selected_candidates, LayerOutput)
-        self.selected_candidates = selected_candidates
-
-        assert isinstance(gold, LayerOutput)
-        self.gold = gold
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy_over_beam(input, name=None):
-    """
-    This layer is used in learning to search models, which is to solve complex
-    joint prediction problems based on learning to search through a
-    problem-defined search space.
-
-    Specifically, the learning to search process for this layer begins with
-    searching a target sequence from a nested sequence. In the first search
-    step, top beam size sequences with highest scores, indices of these top k
-    sequences in the original nested sequence, and the ground truth (also
-    called gold) altogether (a triple) make up of the first beam.
-
-    Then, several special positions, for example, start and end positions
-    that define meaningful segments are searched. In these searches, top k
-    positions with highest scores are selected, and then sequence, starting
-    from the selected starts till ends of the sequences (or a fixed position)
-    are taken to search next.
-
-    We call the possible top k results returned in one search the beam. This
-    search process can be repeated for pre-defined turns and leads to several
-    beam expansions.
-
-    Finally, the layer cross_entropy_over_beam takes all the beam expansions
-    which contain several candidate targets found along the multi-step search.
-    cross_entropy_over_beam calculates cross entropy over the expanded beams
-    which all the candidates in the beam as the normalized factor.
-
-    Note that, if gold falls off the beam at search step t, then the cost is
-    calculated over the beam at step t.
-
-    This cost layer always works together with kmax_seq_score_layer,
-    sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
-    sub-search space.
-
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy_over_beam(input=[
-           BeamInput(
-               candidate_scores=beam1_candidates,
-               selected_candidates=beam1_topk,
-               gold=gold1),
-           BeamInput(
-               candidate_scores=beam2_candidates,
-               selected_candidates=beam2_topk,
-               gold=gold2),
-       ])
-
-
-    :param input: Input beams for this layer.
-    :type input: BeamInput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if isinstance(input, BeamInput):
-        input = [input]
-    else:
-        assert isinstance(input, list), (
-            'input for cross_entropy_over_beam shold be a python list '
-            'of BeamInput object.')
-        for ipt in input:
-            assert isinstance(ipt, BeamInput), (
-                'input for cross_entropy_over_beam '
-                'should be a BeamInput object.')
-
-    ipts = []
-    parents = []
-    for beam in input:
-        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
-        ipts += [
-            beam.candidate_scores.name, beam.selected_candidates.name,
-            beam.gold.name
-        ]
-
-    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
-    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
-    """
-    This is a L1 loss but more smooth. It requires that the
-    sizes of input and label are equal. The formula is as follows,
-
-    .. math::
-
-        L = \sum_{i} smooth_{L1}(input_i - label_i)
-
-    in which
-
-    .. math::
-
-        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
-
-    Reference:
-        `Fast R-CNN
-        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = smooth_l1_cost(input=input_layer,
-                             label=label_layer)
-
-    :param input: The input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    assert input.size == label.size
-
-    Layer(
-        name=name,
-        type=LayerType.SMOOTH_L1,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-def multiplex_layer(input, name=None, layer_attr=None):
-    """
-    This layer multiplex multiple layers according to the indexes,
-    which are provided by the first input layer.
-    inputs[0]: the indexes of the layers to form the output of size batchSize.
-    inputs[1:N]; the candidate output data.
-    For each index i from 0 to batchSize - 1, the i-th row of the output is the
-    the same to the i-th row of the (index[i] + 1)-th layer.
-
-    For each i-th row of output:
-    .. math::
-        y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
-
-    where, y is output. :math:`x_{k}` is the k-th input layer and
-    :math:`k = x_{0}[i] + 1`.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       maxid = multiplex_layer(input=layers)
-
-    :param input: Input layers.
-    :type input: list of LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, collections.Sequence)
-    assert len(input) > 2, 'multiplex_layer should have more than 2 inputs'
-    for i in range(1, len(input)):
-        assert isinstance(input[i], LayerOutput)
-        assert input[i].size == input[1].size, \
-            "All the input layers except the first one should have the same size"
-
-    l = Layer(
-        name=name,
-        type='multiplex',
-        inputs=[x.name for x in input],
-        size=input[1].size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.MULTIPLEX_LAYER,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default("dropout")
-def dropout_layer(input, dropout_rate, name=None):
-    """
-
-    The example usage is:
-
-    .. code-block:: python
-
-        dropout = dropout_layer(input=input_layer, dropout_rate=0.5)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param dropout_rate: The probability of dropout.
-    :type dropout_rate: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    return addto_layer(
-        name=name,
-        input=input,
-        act=LinearActivation(),
-        bias_attr=False,
-        layer_attr=ExtraAttr(drop_rate=dropout_rate))
-
-
-@wrap_name_default()
-@wrap_act_default(act=LinearActivation())
-@wrap_param_attr_default()
-@layer_support(DROPOUT)
-def row_conv_layer(input,
-                   context_len,
-                   act=None,
-                   name=None,
-                   param_attr=None,
-                   layer_attr=None):
-    """
-
-    The row convolution is called lookahead convolution. It is firstly
-    introduced in paper of `Deep Speech 2: End-to-End Speech Recognition
-    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
-
-    The bidirectional RNN that learns representation for a sequence by
-    performing a forward and a backward pass through the entire sequence.
-    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
-    to deploy in an online and low-latency setting. The lookahead convolution
-    incorporates information from future subsequences in a computationally
-    efficient manner to improve unidirectional RNNs.
-
-    The connection of row convolution is different from the 1D sequence
-    convolution. Assumed that, the future context-length is k, that is to say,
-    it can get the output at timestep t by using the the input feature from t-th
-    timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input
-    activations are d, the activations r_t for the new layer at time-step t are:
-
-    .. math::
-
-        r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
-                  \quad \\text{for} \quad  (1 \leq i \leq d)
-
-    Note:
-        The `context_len` is `k + 1`. That is to say, the lookahead step
-        number plus one equals context_len.
-
-
-    .. code-block:: python
-
-       row_conv = row_conv_layer(input=input_layer, context_len=3)
-
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param context_len: The context length equals the lookahead step number
-                        plus one.
-    :type context_len: int
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert context_len > 0, "the context_len must be greatet than 0."
-
-    Layer(
-        inputs=[Input(input.name, **param_attr.attr)],
-        name=name,
-        context_length=context_len,
-        type=LayerType.ROW_CONV_LAYER,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
-
-
-@layer_support()
-@wrap_name_default()
-def prelu_layer(input,
-                name=None,
-                partial_sum=1,
-                channel_shared=None,
-                num_channels=None,
-                param_attr=None,
-                layer_attr=None):
-    """
-    The Parametric Relu activation that actives outputs with a learnable weight.
-
-    Reference:
-        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
-
-    .. math::
-       z_i &\\quad if \\quad z_i > 0 \\\\
-       a_i * z_i  &\\quad \\mathrm{otherwise}
-
-    The example usage is:
-
-    .. code-block:: python
-
-       prelu = prelu_layer(input=layers, partial_sum=1)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param partial_sum: this parameter makes a group of inputs share the same weight.
-
-        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
-        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
-        - partial_sum = number of outputs, indicates all elements share the same weight.
-
-    :type partial_sum: int
-    :param channel_shared: whether or not the parameter are shared across channels.
-
-        - channel_shared = True, we set the partial_sum to the number of outputs.
-        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
-
-    :type channel_shared: bool
-    :param num_channels: number of input channel.
-    :type num_channels: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
-
-    if not param_attr:
-        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
-    else:
-        assert isinstance(param_attr, ParameterAttribute)
-
-    if num_channels is None:
-        assert input.num_filters is not None, \
-                'the input channel cannot be detected, please specify the num_channels parameter'
-        num_channels = input.num_filters
-
-    if channel_shared is not None:
-        assert isinstance(channel_shared, bool)
-        assert (input.height != 0 and input.width != 0), \
-            'input height and widht must be setted'
-        if channel_shared:
-            partial_sum = input.height * input.width * num_channels
-        else:
-            partial_sum = input.height * input.width
-
-    l = Layer(
-        name=name,
-        type=LayerType.PRELU,
-        inputs=Input(input.name, **param_attr.attr),
-        partial_sum=partial_sum,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.PRELU,
-        parents=input,
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-@wrap_act_default(act=LinearActivation())
-def gated_unit_layer(input,
-                     size,
-                     act=None,
-                     name=None,
-                     gate_attr=None,
-                     gate_param_attr=None,
-                     gate_bias_attr=True,
-                     inproj_attr=None,
-                     inproj_param_attr=None,
-                     inproj_bias_attr=True,
-                     layer_attr=None):
-    """
-    The gated unit layer implements a simple gating mechanism over the input.
-    The input :math:`X` is first projected into a new space :math:`X'`, and
-    it is also used to produce a gate weight :math:`\sigma`. Element-wise
-    product between :math:`X'` and :math:`\sigma` is finally returned.
-
-    Reference:
-        `Language Modeling with Gated Convolutional Networks
-        <https://arxiv.org/abs/1612.08083>`_
-
-    .. math::
-       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
-
-    The example usage is:
-
-    .. code-block:: python
-        gated_unit = gated_unit_layer(size=128, input=input_layer))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer's output.
-    :type size: int
-    :param act: Activation type of the projection. LinearActivation is the default
-                activation.
-    :type act: BaseActivation
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param gate_attr: The extra layer attribute of the gate. See ExtraLayerAttribute for
-                      details.
-    :type gate_attr: ExtraLayerAttribute | None
-    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
-                            for details.
-    :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
-                           an object whose type is not ParameterAttribute, no bias is defined.
-                           If this parameter is set to True, the bias is initialized to zero.
-    :type gate_bias_attr: ParameterAttribute | bool | None | Any
-    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
-                        details.
-    :type inproj_attr: ExtraLayerAttribute | None
-    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
-                              for details.
-    :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
-                             or an object whose type is not ParameterAttribute, no bias is defined.
-                             If this parameter is set to True, the bias is initialized to zero.
-    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
-    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(
-        input, LayerOutput), 'The gated linear unit accepts only one input.'
-
-    input_proj = fc_layer(
-        input=input,
-        name="%s_input_proj" % name,
-        size=size,
-        act=act,
-        layer_attr=inproj_attr,
-        param_attr=inproj_param_attr,
-        bias_attr=inproj_bias_attr)
-
-    gate = fc_layer(
-        size=size,
-        name="%s_gate" % name,
-        act=SigmoidActivation(),
-        input=input,
-        layer_attr=gate_attr,
-        param_attr=gate_param_attr,
-        bias_attr=gate_bias_attr)
-    return mixed_layer(
-        name="%s_gated_act" % name,
-        input=dotmul_operator(input_proj, gate),
-        layer_attr=layer_attr)
-
-
-@layer_support()
-@wrap_name_default('switch_order')
-def switch_order_layer(input,
-                       name=None,
-                       reshape_axis=None,
-                       act=None,
-                       layer_attr=None):
-    """
-    This layer switch dimension order of image input.
-    From order "batchSize, channels, height, width"
-    to order "batchSize, height, width, channels".
-
-    The example usage is:
-
-    .. code-block:: python
-       reshape_axis = 3
-       switch = switch_order(input=layer, name='switch', reshape_axis=reshape_axis)
-       reshape = {'height':[ 0, 1, 2], 'width':[3]}
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param reshape_axis: Specify the axises of 'height'. Its value should be positive and less than 4.
-    :type reshape_axis: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert reshape_axis != None and (reshape_axis > 0 and reshape_axis < 4)
-    height = [ele for ele in xrange(reshape_axis)]
-    width = [ele for ele in range(reshape_axis, 4)]
-    reshape = {'height': height, 'width': width}
-
-    l = Layer(
-        name=name,
-        inputs=input.name,
-        reshape=reshape,
-        type=LayerType.SWITCH_ORDER_LAYER,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.SWITCH_ORDER_LAYER,
-        activation=act,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
-    """
-    This layer crops images according to the offset and shape. Users can set
-    the crop shape through the argument 'shape' explicitly or by specifying a
-    reference input layer.
-
-    The example usage is:
-
-    .. code-block:: python
-    crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
-
-    :param input: The input of this layer. If two inputs are given, the second one
-                  will be regarded as the reference.
-                  And the input must be 4-dims and in NCHW order.
-    :type input: LayerOutput | Sequence
-    :param offset: The crop offset.
-    :type offset: Sequence
-    :param axis: The start axis to be cropped. For image input layer:
-        - 0: batch size
-        - 1: channels
-        - 2: height
-        - 3: width
-    :type axis: int
-    :param shape: The shape to be cropped to. Default is None.
-    :type shape: Sequence | None
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    else:
-        assert isinstance(input, collections.Sequence)
-    l = Layer(
-        inputs=[x.name for x in input],
-        axis=axis,
-        offset=offset,
-        shape=shape,
-        name=name,
-        type=LayerType.CROP_LAYER,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.CROP_LAYER,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def sub_nested_seq_layer(input, selected_indices, name=None):
-    """
-    The sub_nested_seq_layer accepts two inputs: the first one is a nested
-    sequence; the second one is a set of selceted indices in the nested sequence.
-
-    Then sub_nest_seq_layer trims the first nested sequence input according
-    to the selected indices to form a new output. This layer is useful in
-    beam training.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
-
-
-    :param input: The input of this layer. It is a nested sequence.
-    :type input: LayerOutput
-    :param selected_indices: A set of sequence indices in the nested sequence.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of '
-        'sub_nested_seq_layer must be a Paddle layer.')
-    assert isinstance(selected_indices, LayerOutput), (
-        'The second input of '
-        'sub_nested_seq_layer must be a Paddle layer.')
-
-    l = Layer(
-        inputs=input.name,
-        selected_indices=selected_indices.name,
-        name=name,
-        type=LayerType.SUB_NESTED_SEQ)
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.SUB_NESTED_SEQ,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default("clip")
-def clip_layer(input, min, max, name=None):
-    """
-    A layer for clipping the input value by the threshold.
-
-    .. math::
-
-        out[i] = \min (\max (in[i],p_{1} ),p_{2} )
-
-    .. code-block:: python
-
-        clip = clip_layer(input=input_layer, min=-10, max=10)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param min: The lower threshold for clipping.
-    :type min: float
-    :param max: The upper threshold for clipping.
-    :type max: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.CLIP_LAYER,
-        inputs=[input.name],
-        min=min,
-        max=max)
-    return LayerOutput(
-        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-def seq_slice_layer(input, starts, ends, name=None):
-    """
-    seq_slice_layer will return one or several sub-sequences from the
-    input sequence layer given start and end indices.
-
-        - If only start indices are given, and end indices are set to None,
-          this layer slices the input sequence from the given start indices
-          to its end.
-        - If only end indices are given, and start indices are set to None,
-          this layer slices the input sequence from its beginning to the
-          given end indices.
-        - If start and end indices are both given, they should have the same
-          number of elements.
-
-    If start or end indices contains more than one elements, the input sequence
-    will be sliced for multiple times.
-
-
-    .. code-block:: python
-
-        seq_silce = seq_slice_layer(input=input_seq,
-                                    starts=start_pos, ends=end_pos)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, which should be a sequence.
-    :type input: LayerOutput
-    :param starts: The start indices to slice the input sequence.
-    :type starts: LayerOutput | None
-    :param ends: The end indices to slice the input sequence.
-    :type ends: LayerOutput | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of seq_slice layer must be a PaddlePaddle layer.')
-
-    if starts is not None:
-        assert isinstance(starts, LayerOutput), (
-            'The start indices for seq_slice layer '
-            'must be a PaddlePaddle layer.')
-    if ends is not None:
-        assert isinstance(ends, LayerOutput), (
-            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
-    assert starts is not None or ends is not None, (
-        'start and end indices '
-        'cannot be set to None at the same time, at least one of '
-        'them should be given.')
-    if starts is not None and ends is not None:
-        assert starts.size == ends.size, (
-            'If start and end indices are both given to seq_slice_layer, '
-            'they should have the same width.')
-
-    Layer(
-        name=name,
-        type=LayerType.SEQ_SLICE,
-        inputs=input.name,
-        starts=starts.name if starts is not None else None,
-        ends=ends.name if ends is not None else None)
-    return LayerOutput(
-        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def kmax_seq_score_layer(input, name=None, beam_size=1):
-    """
-    This layer accepts one input which is scores over a sequence or a nested
-    sequence, and returns indices of beam_size sequences with highest scores.
-
-    .. code-block:: python
-
-        kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size)
-
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer. It stores scores over a sequence or
-                  a nested sequence and its size must be 1.
-    :type input: LayerOutput
-    :param beam_size: The indices of the sequences with top beam_size scores are returned.
-    :type beam_size: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput), ("kmax_seq_score_layer "
-                                            "accepts only one input.")
-    assert input.size == 1, (
-        "input of kmax_seq_score_layer is a score "
-        "over a sequence or a nested sequence, so its width must be 1.")
-
-    Layer(
-        name=name,
-        type=LayerType.KMAX_SEQ_SCORE,
-        inputs=[input.name],
-        beam_size=beam_size)
-
-    return LayerOutput(
-        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
-
-
-@wrap_name_default("conv3d")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=ReluActivation())
-@layer_support(DROPOUT)
-def img_conv3d_layer(input,
-                     filter_size,
-                     num_filters,
-                     name=None,
-                     num_channels=None,
-                     act=None,
-                     groups=1,
-                     stride=1,
-                     padding=0,
-                     bias_attr=None,
-                     param_attr=None,
-                     shared_biases=True,
-                     layer_attr=None,
-                     trans=False,
-                     layer_type=None):
-    """
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        conv = img_conv3d_layer(input=data, filter_size=1,
-                              num_channels=8,
-                              num_filters=16, stride=1,
-                              bias_attr=False,
-                              act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel along three axises. If the parameter
-                        is set to one integer, the three dimensions will be same.
-    :type filter_size: int | tuple | list
-    :param num_filters: The number of filters. It is as same as the output image channel.
-    :type num_filters: int
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param groups: The number of the filter groups.
-    :type groups: int
-    :param stride: The strides of the convolution along three axises. If the parameter
-                   is set to one integer, the three strides will be same.
-    :type stride: int | tuple | list
-    :param padding: The numbers of padding along three axises. If the parameter is set to
-                    one integer, they will be same.
-    :type padding: int | tuple | list
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param shared_biases: Whether biases will be shared between filters or not.
-    :type shared_biases: bool
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param trans: True if it is a convTransLayer, False if it is a convLayer
-    :type trans: bool
-    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
-                       when trans=True. If not set, it will be automatically set to "deconv3d"
-                       when trans=True and "conv3d" when trans=False.
-    :type layer_type: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if isinstance(filter_size, collections.Sequence):
-        assert len(filter_size) == 3
-        filter_size, filter_size_y, filter_size_z = filter_size
-    else:
-        filter_size_y = filter_size
-        filter_size_z = filter_size
-
-    if isinstance(stride, collections.Sequence):
-        assert len(stride) == 3
-        stride, stride_y, stride_z = stride
-    else:
-        stride_y = stride
-        stride_z = stride
-
-    if isinstance(padding, collections.Sequence):
-        assert len(padding) == 3
-        padding, padding_y, padding_z = padding
-    else:
-        padding_y = padding
-        padding_z = padding
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    if layer_type:
-        if trans:
-            assert layer_type in ["deconv3d"]
-        lt = layer_type
-    else:
-        lt = LayerType.DECONV3D_LAYER if trans else LayerType.CONV3D_LAYER
-
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            conv=Conv3D(
-                filter_size=filter_size,
-                padding=padding,
-                stride=stride,
-                channels=num_channels,
-                groups=groups,
-                filter_size_y=filter_size_y,
-                padding_y=padding_y,
-                stride_y=stride_y,
-                filter_size_z=filter_size_z,
-                padding_z=padding_z,
-                stride_z=stride_z),
-            **param_attr.attr),
-        active_type=act.name,
-        num_filters=num_filters,
-        bias=ParamAttr.to_bias(bias_attr),
-        shared_biases=shared_biases,
-        type=lt,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        lt,
-        parents=[input],
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_name_default("scale_shift")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
-    """
-    A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scales it and then
-    adds a bias to it.
-
-    This layer is very like the SlopeInterceptLayer, except the scale and
-    bias are trainable.
-
-    .. math::
-
-        y = w * x + b
-
-    .. code-block:: python
-
-        scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute of scaling. See ParameterAttribute for
-                      details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SCALE_SHIFT_LAYER,
-        inputs=Input(input.name, **param_attr.attr),
-        bias=ParamAttr.to_bias(bias_attr))
-    return LayerOutput(
-        name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default("resize")
-def resize_layer(input, size, name=None):
-    """
-    The resize layer resizes the input matrix with a shape of [Height, Width]
-    into the output matrix with a shape of [Height x Width / size, size],
-    where size is the parameter of this layer indicating the output dimension.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The resized output dimension of this layer.
-    :type size: int
-    :return: A LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
-    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
-
-
-@wrap_act_default(act=LinearActivation())
-@wrap_name_default('sub_seq')
-def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
-    """
-    sub_seq_layer will return sub-sequences from the input sequences. For each
-    sequence in the input sequence layer, sub_seq_layer will slice it by given
-    offset and size. Please notice that, number of offset value and size value
-    both are equal to the number of sequence in the input layer.
-
-    .. code-block:: python
-
-        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, which should be sequence.
-    :type input: LayerOutput
-    :param offsets: The offset indices to slice the input sequence, which should
-                    be sequence type.
-    :type offsets: LayerOutput
-    :param sizes: The sizes of the sub-sequences, which should be sequence type.
-    :type sizes: LayerOutput
-    :param act: Activation type, LinearActivation is the default activation.
-    :type act: BaseActivation.
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
-    assert isinstance(offsets, LayerOutput), (
-        'The offset indices for sub_seq_layer, '
-        'must be a PaddlePaddle layer.')
-    assert isinstance(sizes, LayerOutput), (
-        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
-
-    Layer(
-        name=name,
-        type=LayerType.SUB_SEQ_LAYER,
-        inputs=[input.name, offsets.name, sizes.name],
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.SUB_SEQ_LAYER,
-        parents=[input, offsets, sizes],
-        size=input.size)
-
-
-@wrap_name_default('scale_sub_region')
-def scale_sub_region_layer(input, indices, value, name=None):
-    """
-    Given an image or feature map with CHW information, scale_sub_region_layer
-    can be used to multiply a real value to values of a sub continuous region.
-    You can provide start and end indices of CHW for each instance.
-    Please notice that all start indices are counting from 1.
-    The shape of indices should be [batch_size, 6] and the layout for each row
-    is [C_Start, C_End, H_Start, H_End, W_Start, W_End].
-
-    .. code-block:: python
-
-        scale_sub_region = scale_sub_region_layer(input=input,
-                                                  indices=indices,
-                                                  value=value)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer which should contains CHW information.
-    :type input: LayerOutput
-    :param indices: Start index and end index for C H W, the input value should
-                    be a 2-D matrix with shape [batch_size, 6].
-    :type indices: LayerOutput.
-    :param value: value to multiply.
-    :type value: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of scale_sub_region_layer, '
-        'must be a PaddlePaddle layer.')
-    assert isinstance(indices, LayerOutput), (
-        'The start and end indices for CHW, must be a PaddlePaddle layer.')
-    assert isinstance(value, float), (
-        'The value to multiply, must be a real value.')
-
-    Layer(
-        name=name,
-        type=LayerType.SCALE_SUB_REGION_LAYER,
-        inputs=[input.name, indices.name],
-        value=value)
-
-    return LayerOutput(
-        name,
-        LayerType.SCALE_SUB_REGION_LAYER,
-        parents=[input, indices],
-        num_filters=input.num_filters,
-        size=input.size)
-
-
-@wrap_name_default()
-@wrap_act_default(act=LinearActivation())
-@wrap_param_attr_default()
-@layer_support()
-def factorization_machine(input,
-                          factor_size,
-                          act=None,
-                          name=None,
-                          param_attr=None,
-                          layer_attr=None):
-    """
-    The Factorization Machine models pairwise feature interactions as inner
-    product of the learned latent vectors corresponding to each input feature.
-    The Factorization Machine can effectively capture feature interactions
-    especially when the input is sparse.
-
-    This implementation only consider the 2-order feature interactions using
-    Factorization Machine with the formula:
-
-    .. math::
-        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
-
-    Note:
-        X is the input vector with size n. V is the factor matrix. Each row of V
-        is the latent vector corresponding to each input dimesion. The size of
-        each latent vector is k.
-
-    For details of Factorization Machine, please refer to the paper:
-    Factorization machines.
-
-    .. code-block:: python
-        first_order = paddle.layer.fc(input=input,
-                                      size=1,
-                                      act=paddle.activation.Linear())
-        second_order = paddle.layer.factorization_machine(input=input,
-                                                          factor_size=10)
-        fm = paddle.layer.addto(input=[first_order, second_order],
-                                act=paddle.activation.Linear(),
-                                bias_attr=False)
-
-    :param input: The input layer. Supported input types: all input data types
-                  on CPU, and only dense input types on GPU.
-    :type input: LayerOutput
-    :param factor_size: The hyperparameter that defines the dimensionality of
-                        the latent vector size.
-    :type context_len: int
-    :param act: Activation Type. Default is linear activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert factor_size > 0, "the factor_size must be greater than 0."
-
-    Layer(
-        inputs=[Input(input.name, **param_attr.attr)],
-        name=name,
-        factor_size=factor_size,
-        type=LayerType.FACTORIZATION_MACHINE,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
deleted file mode 100644
index b5cde7bac77..00000000000
--- a/python/paddle/trainer_config_helpers/networks.py
+++ /dev/null
@@ -1,1813 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
-    IdentityActivation, TanhActivation, SequenceSoftmaxActivation
-from attrs import ExtraAttr
-from default_decorators import wrap_name_default, wrap_act_default, \
-    wrap_param_default, wrap_bias_attr_default, wrap_param_attr_default
-from layers import *  # There are too many layers used in network, so import *
-from poolings import MaxPooling, SumPooling
-from paddle.trainer.config_parser import *
-
-__all__ = [
-    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
-    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
-    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
-    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
-    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
-]
-
-######################################################
-#                     Text CNN                       #
-######################################################
-
-
-@wrap_name_default("sequence_conv_pooling")
-def sequence_conv_pool(input,
-                       context_len,
-                       hidden_size,
-                       name=None,
-                       context_start=None,
-                       pool_type=None,
-                       context_proj_layer_name=None,
-                       context_proj_param_attr=False,
-                       fc_layer_name=None,
-                       fc_param_attr=None,
-                       fc_bias_attr=None,
-                       fc_act=None,
-                       pool_bias_attr=None,
-                       fc_attr=None,
-                       context_attr=None,
-                       pool_attr=None):
-    """
-    Text convolution pooling group.
-
-    Text input => Context Projection => FC Layer => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param context_len: context projection length. See
-                        context_projection's document.
-    :type context_len: int
-    :param hidden_size: FC Layer size.
-    :type hidden_size: int
-    :param context_start: context start position. See
-                          context_projection's context_start.
-    :type context_start: int|None
-    :param pool_type: pooling layer type. See pooling_layer's document.
-    :type pool_type: BasePoolingType
-    :param context_proj_layer_name: context projection layer name.
-                                    None if user don't care.
-    :type context_proj_layer_name: basestring
-    :param context_proj_param_attr: padding parameter attribute of context projection layer.
-                                    If false, it means padding always be zero.
-    :type context_proj_param_attr: ParameterAttribute|None
-    :param fc_layer_name: fc layer name. None if user don't care.
-    :type fc_layer_name: basestring
-    :param fc_param_attr: fc layer parameter attribute. None if user don't care.
-    :type fc_param_attr: ParameterAttribute|None
-    :param fc_bias_attr: fc bias parameter attribute. False if no bias,
-                         None if user don't care.
-    :type fc_bias_attr: ParameterAttribute|False|None
-    :param fc_act: fc layer activation type. None means tanh.
-    :type fc_act: BaseActivation
-    :param pool_bias_attr: pooling layer bias attr. False if no bias.
-                           None if user don't care.
-    :type pool_bias_attr: ParameterAttribute|False|None
-    :param fc_attr: fc layer extra attribute.
-    :type fc_attr: ExtraLayerAttribute
-    :param context_attr: context projection layer extra attribute.
-    :type context_attr: ExtraLayerAttribute
-    :param pool_attr: pooling layer extra attribute.
-    :type pool_attr: ExtraLayerAttribute
-    :return: layer's output.
-    :rtype: LayerOutput
-    """
-    # Set Default Value to param
-    context_proj_layer_name = "%s_conv_proj" % name \
-        if context_proj_layer_name is None else context_proj_layer_name
-
-    with mixed_layer(
-            name=context_proj_layer_name,
-            size=input.size * context_len,
-            act=LinearActivation(),
-            layer_attr=context_attr) as m:
-        m += context_projection(
-            input,
-            context_len=context_len,
-            context_start=context_start,
-            padding_attr=context_proj_param_attr)
-
-    fc_layer_name = "%s_conv_fc" % name \
-        if fc_layer_name is None else fc_layer_name
-    fl = fc_layer(
-        name=fc_layer_name,
-        input=m,
-        size=hidden_size,
-        act=fc_act,
-        layer_attr=fc_attr,
-        param_attr=fc_param_attr,
-        bias_attr=fc_bias_attr)
-
-    return pooling_layer(
-        name=name,
-        input=fl,
-        pooling_type=pool_type,
-        bias_attr=pool_bias_attr,
-        layer_attr=pool_attr)
-
-
-text_conv_pool = sequence_conv_pool
-
-############################################################################
-#                       Images                                             #
-############################################################################
-
-
-@wrap_name_default("conv_pool")
-def simple_img_conv_pool(input,
-                         filter_size,
-                         num_filters,
-                         pool_size,
-                         name=None,
-                         pool_type=None,
-                         act=None,
-                         groups=1,
-                         conv_stride=1,
-                         conv_padding=0,
-                         bias_attr=None,
-                         num_channel=None,
-                         param_attr=None,
-                         shared_bias=True,
-                         conv_layer_attr=None,
-                         pool_stride=1,
-                         pool_padding=0,
-                         pool_layer_attr=None):
-    """
-    Simple image convolution and pooling group.
-
-    Img input => Conv => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param filter_size: see img_conv_layer for details.
-    :type filter_size: int
-    :param num_filters: see img_conv_layer for details.
-    :type num_filters: int
-    :param pool_size: see img_pool_layer for details.
-    :type pool_size: int
-    :param pool_type: see img_pool_layer for details.
-    :type pool_type: BasePoolingType
-    :param act: see img_conv_layer for details.
-    :type act: BaseActivation
-    :param groups: see img_conv_layer for details.
-    :type groups: int
-    :param conv_stride: see img_conv_layer for details.
-    :type conv_stride: int
-    :param conv_padding: see img_conv_layer for details.
-    :type conv_padding: int
-    :param bias_attr: see img_conv_layer for details.
-    :type bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer for details.
-    :type num_channel: int
-    :param param_attr: see img_conv_layer for details.
-    :type param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer for details.
-    :type conv_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_pool_layer for details.
-    :type pool_stride: int
-    :param pool_padding: see img_pool_layer for details.
-    :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer for details.
-    :type pool_layer_attr: ExtraLayerAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    _conv_ = img_conv_layer(
-        name="%s_conv" % name,
-        input=input,
-        filter_size=filter_size,
-        num_filters=num_filters,
-        num_channels=num_channel,
-        act=act,
-        groups=groups,
-        stride=conv_stride,
-        padding=conv_padding,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias,
-        layer_attr=conv_layer_attr)
-    return img_pool_layer(
-        name="%s_pool" % name,
-        input=_conv_,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        stride=pool_stride,
-        padding=pool_padding,
-        layer_attr=pool_layer_attr)
-
-
-@wrap_name_default("conv_bn_pool")
-def img_conv_bn_pool(input,
-                     filter_size,
-                     num_filters,
-                     pool_size,
-                     name=None,
-                     pool_type=None,
-                     act=None,
-                     groups=1,
-                     conv_stride=1,
-                     conv_padding=0,
-                     conv_bias_attr=None,
-                     num_channel=None,
-                     conv_param_attr=None,
-                     shared_bias=True,
-                     conv_layer_attr=None,
-                     bn_param_attr=None,
-                     bn_bias_attr=None,
-                     bn_layer_attr=None,
-                     pool_stride=1,
-                     pool_padding=0,
-                     pool_layer_attr=None):
-    """
-    Convolution, batch normalization, pooling group.
-
-    Img input => Conv => BN => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param filter_size: see img_conv_layer for details.
-    :type filter_size: int
-    :param num_filters: see img_conv_layer for details.
-    :type num_filters: int
-    :param pool_size: see img_pool_layer for details.
-    :type pool_size: int
-    :param pool_type: see img_pool_layer for details.
-    :type pool_type: BasePoolingType
-    :param act: see batch_norm_layer for details.
-    :type act: BaseActivation
-    :param groups: see img_conv_layer for details.
-    :type groups: int
-    :param conv_stride: see img_conv_layer for details.
-    :type conv_stride: int
-    :param conv_padding: see img_conv_layer for details.
-    :type conv_padding: int
-    :param conv_bias_attr: see img_conv_layer for details.
-    :type conv_bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer for details.
-    :type num_channel: int
-    :param conv_param_attr: see img_conv_layer for details.
-    :type conv_param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer for details.
-    :type conv_layer_attr: ExtraLayerOutput
-    :param bn_param_attr: see batch_norm_layer for details.
-    :type bn_param_attr: ParameterAttribute
-    :param bn_bias_attr: see batch_norm_layer for details.
-    :type bn_bias_attr: ParameterAttribute
-    :param bn_layer_attr: see batch_norm_layer for details.
-    :type bn_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_pool_layer for details.
-    :type pool_stride: int
-    :param pool_padding: see img_pool_layer for details.
-    :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer for details.
-    :type pool_layer_attr: ExtraLayerAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    __conv__ = img_conv_layer(
-        name="%s_conv" % name,
-        input=input,
-        filter_size=filter_size,
-        num_filters=num_filters,
-        num_channels=num_channel,
-        act=LinearActivation(),
-        groups=groups,
-        stride=conv_stride,
-        padding=conv_padding,
-        bias_attr=conv_bias_attr,
-        param_attr=conv_param_attr,
-        shared_biases=shared_bias,
-        layer_attr=conv_layer_attr)
-    __bn__ = batch_norm_layer(
-        name="%s_bn" % name,
-        input=__conv__,
-        act=act,
-        bias_attr=bn_bias_attr,
-        param_attr=bn_param_attr,
-        layer_attr=bn_layer_attr)
-    return img_pool_layer(
-        name="%s_pool" % name,
-        input=__bn__,
-        pool_type=pool_type,
-        pool_size=pool_size,
-        stride=pool_stride,
-        padding=pool_padding,
-        layer_attr=pool_layer_attr)
-
-
-@wrap_act_default(param_names=['conv_act'], act=ReluActivation())
-@wrap_param_default(
-    param_names=['pool_type'], default_factory=lambda _: MaxPooling())
-def img_conv_group(input,
-                   conv_num_filter,
-                   pool_size,
-                   num_channels=None,
-                   conv_padding=1,
-                   conv_filter_size=3,
-                   conv_act=None,
-                   conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=0,
-                   pool_stride=1,
-                   pool_type=None,
-                   param_attr=None):
-    """
-    Image Convolution Group, Used for vgg net.
-
-    :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
-        conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm.
-    :type conv_batchnorm_drop_rate: list
-    :param input: input layer.
-    :type input: LayerOutput
-    :param conv_num_filter: list of output channels num.
-    :type conv_num_filter: list|tuple
-    :param pool_size: pooling filter size.
-    :type pool_size: int
-    :param num_channels: input channels num.
-    :type num_channels: int
-    :param conv_padding: convolution padding size.
-    :type conv_padding: int
-    :param conv_filter_size: convolution filter size.
-    :type conv_filter_size: int
-    :param conv_act: activation funciton after convolution.
-    :type conv_act: BaseActivation
-    :param conv_with_batchnorm: if conv_with_batchnorm[i] is true,
-        there is a batch normalization operation after each convolution.
-    :type conv_with_batchnorm: list
-    :param pool_stride: pooling stride size.
-    :type pool_stride: int
-    :param pool_type: pooling type.
-    :type pool_type: BasePoolingType
-    :param param_attr: param attribute of convolution layer,
-                       None means default attribute.
-    :type param_attr: ParameterAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    tmp = input
-
-    # Type checks
-    assert isinstance(tmp, LayerOutput)
-    assert isinstance(conv_num_filter, list) or isinstance(conv_num_filter,
-                                                           tuple)
-    for each_num_filter in conv_num_filter:
-        assert isinstance(each_num_filter, int)
-
-    assert isinstance(pool_size, int)
-
-    def __extend_list__(obj):
-        if not hasattr(obj, '__len__'):
-            return [obj] * len(conv_num_filter)
-        else:
-            return obj
-
-    conv_padding = __extend_list__(conv_padding)
-    conv_filter_size = __extend_list__(conv_filter_size)
-    conv_act = __extend_list__(conv_act)
-    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
-    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
-
-    for i in xrange(len(conv_num_filter)):
-        extra_kwargs = dict()
-        if num_channels is not None:
-            extra_kwargs['num_channels'] = num_channels
-            num_channels = None
-        if conv_with_batchnorm[i]:
-            extra_kwargs['act'] = LinearActivation()
-        else:
-            extra_kwargs['act'] = conv_act[i]
-
-        tmp = img_conv_layer(
-            input=tmp,
-            padding=conv_padding[i],
-            filter_size=conv_filter_size[i],
-            num_filters=conv_num_filter[i],
-            param_attr=param_attr,
-            **extra_kwargs)
-
-        # logger.debug("tmp.num_filters = %d" % tmp.num_filters)
-
-        if conv_with_batchnorm[i]:
-            dropout = conv_batchnorm_drop_rate[i]
-            if dropout == 0 or abs(dropout) < 1e-5:  # dropout not set
-                tmp = batch_norm_layer(input=tmp, act=conv_act[i])
-            else:
-                tmp = batch_norm_layer(
-                    input=tmp,
-                    act=conv_act[i],
-                    layer_attr=ExtraAttr(drop_rate=dropout))
-
-    return img_pool_layer(
-        input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)
-
-
-@wrap_name_default("separable_conv")
-def img_separable_conv(input,
-                       num_channels,
-                       num_out_channels,
-                       filter_size,
-                       stride=1,
-                       padding=0,
-                       depth_multiplier=1,
-                       act=None,
-                       bias_attr=None,
-                       param_attr=None,
-                       shared_bias=True,
-                       layer_type='exconv',
-                       name=None):
-    """
-    Separable Convolution.
-
-    The separable convolution module is consisted of a depthwise convolution
-    that acts separately on input channels, followed by a pointwise convolution
-    with 1*1 kernels that mixes channels. It is used for Xception:
-    https://arxiv.org/pdf/1610.02357.pdf
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param num_channels: the number of input channels.
-    :type num_channels: int
-    :param num_out_channels: the number of output channels.
-    :type num_out_channels: int
-    :param filter_size: the filter size for the depthwise convolution.
-    :type filter_size: int|tuple
-    :param stride: the stride size for the depthwise convolution.
-    :type stride: int|tuple
-    :param padding: the padding size for the depthwise convolution.
-    :type padding: int|tuple
-    :param depth_multiplier: the number of filter for one channel in the
-                             depthwize convolution.
-    :type depth_multiplier: int
-    :param act: the activation function for the output.
-    :type act: BaseActivation
-    :param bias_attr: see img_conv_layer for details.
-    :type bias_attr: ParameterAttribute
-    :param param_attr: see img_conv_layer for details.
-    :type param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param layer_type: see img_conv_layer for details.
-    :type layer_type: bool
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    __depthwise_conv__ = img_conv_layer(
-        name="%s_depthwise_conv" % name,
-        input=input,
-        num_channels=num_channels,
-        num_filters=num_channels * depth_multiplier,
-        groups=num_channels,
-        filter_size=filter_size,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias,
-        layer_type=layer_type)
-    __pointwise_conv__ = img_conv_layer(
-        name="%s_pointwise_conv" % name,
-        input=__depthwise_conv__,
-        num_channels=num_channels * depth_multiplier,
-        num_filters=num_out_channels,
-        filter_size=1,
-        stride=1,
-        padding=0,
-        act=act,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias)
-    return __pointwise_conv__
-
-
-def small_vgg(input_image, num_channels, num_classes):
-    def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
-        return img_conv_group(
-            input=ipt,
-            num_channels=num_channels_,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * times,
-            conv_filter_size=3,
-            conv_act=ReluActivation(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=MaxPooling())
-
-    tmp = __vgg__(input_image, 64, 2, [0.3, 0], num_channels)
-    tmp = __vgg__(tmp, 128, 2, [0.4, 0])
-    tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0])
-    tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0])
-    tmp = img_pool_layer(
-        input=tmp, stride=2, pool_size=2, pool_type=MaxPooling())
-    tmp = dropout_layer(input=tmp, dropout_rate=0.5)
-    tmp = fc_layer(
-        input=tmp,
-        size=512,
-        layer_attr=ExtraAttr(drop_rate=0.5),
-        act=LinearActivation())
-    tmp = batch_norm_layer(input=tmp, act=ReluActivation())
-    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-
-
-def vgg_16_network(input_image, num_channels, num_classes=1000):
-    """
-    Same model from https://gist.github.com/ksimonyan/211839e770f7b538e2d8
-
-    :param num_classes: number of class.
-    :type num_classes: int
-    :param input_image: input layer.
-    :type input_image: LayerOutput
-    :param num_channels: input channels num.
-    :type num_channels: int
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-
-    tmp = img_conv_group(
-        input=input_image,
-        num_channels=num_channels,
-        conv_padding=1,
-        conv_num_filter=[64, 64],
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_size=2,
-        pool_stride=2,
-        pool_type=MaxPooling())
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[128, 128],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[256, 256, 256],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[512, 512, 512],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[512, 512, 512],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-
-
-############################################################################
-#                       Recurrent                                          #
-############################################################################
-
-
-@wrap_name_default("lstm")
-def simple_lstm(input,
-                size,
-                name=None,
-                reverse=False,
-                mat_param_attr=None,
-                bias_param_attr=None,
-                inner_param_attr=None,
-                act=None,
-                gate_act=None,
-                state_act=None,
-                mixed_layer_attr=None,
-                lstm_cell_attr=None):
-    """
-    Simple LSTM Cell.
-
-    It just combines a mixed layer with fully_matrix_projection and a lstmemory
-    layer. The simple lstm cell was implemented with follow equations.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-    Please refer to **Generating Sequences With Recurrent Neural Networks** for more
-    details about lstm. Link_ is here.
-
-    .. _Link: http://arxiv.org/abs/1308.0850
-
-    :param name: lstm layer name.
-    :type name: basestring
-    :param input: layer's input.
-    :type input: LayerOutput
-    :param size: lstm layer size.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param mat_param_attr: parameter attribute of matrix projection in mixed layer.
-    :type mat_param_attr: ParameterAttribute
-    :param bias_param_attr: bias parameter attribute. False means no bias, None
-                            means default bias.
-    :type bias_param_attr: ParameterAttribute|False
-    :param inner_param_attr: parameter attribute of lstm cell.
-    :type inner_param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param mixed_layer_attr: extra attribute of mixed layer.
-    :type mixed_layer_attr: ExtraLayerAttribute
-    :param lstm_cell_attr: extra attribute of lstm.
-    :type lstm_cell_attr: ExtraLayerAttribute
-    :return: layer's output.
-    :rtype: LayerOutput
-    """
-    fc_name = 'lstm_transform_%s' % name
-    with mixed_layer(
-            name=fc_name,
-            size=size * 4,
-            act=IdentityActivation(),
-            layer_attr=mixed_layer_attr,
-            bias_attr=False) as m:
-        m += full_matrix_projection(input, param_attr=mat_param_attr)
-
-    return lstmemory(
-        name=name,
-        input=m,
-        reverse=reverse,
-        bias_attr=bias_param_attr,
-        param_attr=inner_param_attr,
-        act=act,
-        gate_act=gate_act,
-        state_act=state_act,
-        layer_attr=lstm_cell_attr)
-
-
-@wrap_name_default('lstm_unit')
-def lstmemory_unit(input,
-                   out_memory=None,
-                   name=None,
-                   size=None,
-                   param_attr=None,
-                   act=None,
-                   gate_act=None,
-                   state_act=None,
-                   input_proj_bias_attr=None,
-                   input_proj_layer_attr=None,
-                   lstm_bias_attr=None,
-                   lstm_layer_attr=None):
-    """
-    lstmemory_unit defines the caculation process of a LSTM unit during a
-    single time step. This function is not a recurrent layer, so it can not be
-    directly used to process sequence input. This function is always used in
-    recurrent_group (see layers.py for more details) to implement attention
-    mechanism.
-
-    Please refer to  **Generating Sequences With Recurrent Neural Networks**
-    for more details about LSTM. The link goes as follows:
-    .. _Link: https://arxiv.org/abs/1308.0850
-
-    ..  math::
-
-        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        lstm_step = lstmemory_unit(input=[layer1],
-                                   size=256,
-                                   act=TanhActivation(),
-                                   gate_act=SigmoidActivation(),
-                                   state_act=TanhActivation())
-
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param out_memory: The output of previous time step.
-    :type out_memory: LayerOutput | None
-    :param name: The lstmemory unit name.
-    :type name: basestring
-    :param size: The lstmemory unit size.
-    :type size: int
-    :param param_attr: The parameter attribute for the weights in
-                     input to hidden projection.
-                     None means default attribute.
-    :type param_attr: ParameterAttribute
-    :param act: The last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: The gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: The state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param input_proj_bias_attr: The parameter attribute for the bias in
-                      input to hidden projection.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type input_proj_bias_attr: ParameterAttribute|bool|None
-    :param input_proj_layer_attr: The extra layer attribute for
-                     input to hidden projection of the LSTM unit,
-                     such as dropout, error clipping.
-    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type lstm_bias_attr: ParameterAttribute|True|None
-    :param lstm_layer_attr: The extra attribute of lstm layer.
-    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: The lstmemory unit name.
-    :rtype: LayerOutput
-    """
-    if size is None:
-        assert input.size % 4 == 0
-        size = input.size / 4
-    if out_memory is None:
-        out_mem = memory(name=name, size=size)
-    else:
-        out_mem = out_memory
-
-    state_mem = memory(name="%s_state" % name, size=size)
-
-    with mixed_layer(
-            name="%s_input_recurrent" % name,
-            size=size * 4,
-            bias_attr=input_proj_bias_attr,
-            layer_attr=input_proj_layer_attr,
-            act=IdentityActivation()) as m:
-        m += identity_projection(input=input)
-        m += full_matrix_projection(input=out_mem, param_attr=param_attr)
-
-    lstm_out = lstm_step_layer(
-        name=name,
-        input=m,
-        state=state_mem,
-        size=size,
-        bias_attr=lstm_bias_attr,
-        act=act,
-        gate_act=gate_act,
-        state_act=state_act,
-        layer_attr=lstm_layer_attr)
-    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')
-
-    return lstm_out
-
-
-@wrap_name_default('lstm_group')
-def lstmemory_group(input,
-                    size=None,
-                    name=None,
-                    out_memory=None,
-                    reverse=False,
-                    param_attr=None,
-                    act=None,
-                    gate_act=None,
-                    state_act=None,
-                    input_proj_bias_attr=None,
-                    input_proj_layer_attr=None,
-                    lstm_bias_attr=None,
-                    lstm_layer_attr=None):
-    """
-    lstm_group is a recurrent_group version of Long Short Term Memory. It
-    does exactly the same calculation as the lstmemory layer (see lstmemory in
-    layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states(or hidden states) in every time step are accessible to the
-    user. This is especially useful in attention model. If you do not need to
-    access the internal states of the lstm and merely use its outputs,
-    it is recommended to use the lstmemory, which is relatively faster than
-    lstmemory_group.
-
-    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
-    multiplications:
-    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
-    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
-    speed up the calculations. Consequently, an additional mixed_layer with
-    full_matrix_projection must be included before lstmemory_unit is called.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        lstm_step = lstmemory_group(input=[layer1],
-                                    size=256,
-                                    act=TanhActivation(),
-                                    gate_act=SigmoidActivation(),
-                                    state_act=TanhActivation())
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param size: The lstmemory group size.
-    :type size: int
-    :param name: The name of lstmemory group.
-    :type name: basestring
-    :param out_memory: The output of previous time step.
-    :type out_memory: LayerOutput | None
-    :param reverse: Process the input in a reverse order or not.
-    :type reverse: bool
-    :param param_attr: The parameter attribute for the weights in
-                     input to hidden projection.
-                     None means default attribute.
-    :type param_attr: ParameterAttribute
-    :param act: The last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: The gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: The state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param input_proj_bias_attr: The parameter attribute for the bias in
-                      input to hidden projection.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type input_proj_bias_attr: ParameterAttribute|bool|None
-    :param input_proj_layer_attr: The extra layer attribute for
-                     input to hidden projection of the LSTM unit,
-                     such as dropout, error clipping.
-    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type lstm_bias_attr: ParameterAttribute|True|None
-    :param lstm_layer_attr: The extra attribute of lstm layer.
-    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: the lstmemory group.
-    :rtype: LayerOutput
-    """
-
-    def __lstm_step__(ipt):
-        return lstmemory_unit(
-            input=ipt,
-            name=name,
-            size=size,
-            act=act,
-            gate_act=gate_act,
-            state_act=state_act,
-            out_memory=out_memory,
-            input_proj_bias_attr=input_proj_bias_attr,
-            input_proj_layer_attr=input_proj_layer_attr,
-            param_attr=param_attr,
-            lstm_layer_attr=lstm_layer_attr,
-            lstm_bias_attr=lstm_bias_attr)
-
-    return recurrent_group(
-        name='%s_recurrent_group' % name,
-        step=__lstm_step__,
-        reverse=reverse,
-        input=input)
-
-
-@wrap_name_default('gru_unit')
-def gru_unit(input,
-             memory_boot=None,
-             size=None,
-             name=None,
-             gru_bias_attr=None,
-             gru_param_attr=None,
-             act=None,
-             gate_act=None,
-             gru_layer_attr=None,
-             naive=False):
-    """
-    gru_unit defines the calculation process of a gated recurrent unit during a single
-    time step. This function is not a recurrent layer, so it can not be
-    directly used to process sequence input. This function is always used in
-    the recurrent_group (see layers.py for more details) to implement attention
-    mechanism.
-
-    Please see grumemory in layers.py for the details about the maths.
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param act: activation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activation type or gru
-    :type gate_act: BaseActivation
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru output layer.
-    :rtype: LayerOutput
-    """
-
-    assert input.size % 3 == 0
-    if size is None:
-        size = input.size / 3
-
-    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
-
-    if naive:
-        __step__ = gru_step_naive_layer
-    else:
-        __step__ = gru_step_layer
-
-    gru_out = __step__(
-        name=name,
-        input=input,
-        output_mem=out_mem,
-        size=size,
-        bias_attr=gru_bias_attr,
-        param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        layer_attr=gru_layer_attr)
-    return gru_out
-
-
-@wrap_name_default('gru_group')
-def gru_group(input,
-              memory_boot=None,
-              size=None,
-              name=None,
-              reverse=False,
-              gru_bias_attr=None,
-              gru_param_attr=None,
-              act=None,
-              gate_act=None,
-              gru_layer_attr=None,
-              naive=False):
-    """
-    gru_group is a recurrent_group version of Gated Recurrent Unit. It
-    does exactly the same calculation as the grumemory layer does. A promising
-    benefit is that gru hidden states are accessible to the user. This is
-    especially useful in attention model. If you do not need to access
-    any internal state and merely use the outputs of a GRU, it is recommended
-    to use the grumemory, which is relatively faster.
-
-    Please see grumemory in layers.py for more detail about the maths.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = gru_group(input=[layer1],
-                        size=256,
-                        act=TanhActivation(),
-                        gate_act=SigmoidActivation())
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-
-    def __gru_step__(ipt):
-        return gru_unit(
-            input=ipt,
-            memory_boot=memory_boot,
-            name=name,
-            size=size,
-            gru_bias_attr=gru_bias_attr,
-            gru_param_attr=gru_param_attr,
-            act=act,
-            gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr,
-            naive=naive)
-
-    return recurrent_group(
-        name='%s_recurrent_group' % name,
-        step=__gru_step__,
-        reverse=reverse,
-        input=input)
-
-
-@wrap_name_default('simple_gru')
-def simple_gru(input,
-               size,
-               name=None,
-               reverse=False,
-               mixed_param_attr=None,
-               mixed_bias_param_attr=None,
-               mixed_layer_attr=None,
-               gru_bias_attr=None,
-               gru_param_attr=None,
-               act=None,
-               gate_act=None,
-               gru_layer_attr=None,
-               naive=False):
-    """
-    You may see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
-    simple_gru in network.py. The reason why there are so many interfaces is
-    that we have two ways to implement recurrent neural network. One way is to
-    use one complete layer to implement rnn (including simple rnn, gru and lstm)
-    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But
-    the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py.
-    The other implementation is to use an recurrent group which can ensemble a
-    series of layers to compute rnn step by step. This way is flexible for
-    attenion mechanism or other complex connections.
-
-    - gru_step_layer: only compute rnn by one step. It needs an memory as input
-      and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory.
-    - gru_group: a GRU cell implemented by a combination of multiple layers in
-      recurrent group.
-      But :math:`W x_t` is not done in group.
-    - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group.
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and
-      gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory.
-
-    The computational speed is that, grumemory is relatively better than
-    gru_group, and gru_group is relatively better than simple_gru.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = simple_gru(input=[layer1], size=256)
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name='%s_transform' % name,
-            size=size * 3,
-            bias_attr=mixed_bias_param_attr,
-            layer_attr=mixed_layer_attr) as m:
-        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
-
-    return gru_group(
-        name=name,
-        size=size,
-        input=m,
-        reverse=reverse,
-        gru_bias_attr=gru_bias_attr,
-        gru_param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr,
-        naive=naive)
-
-
-@wrap_name_default('simple_gru2')
-def simple_gru2(input,
-                size,
-                name=None,
-                reverse=False,
-                mixed_param_attr=None,
-                mixed_bias_attr=None,
-                gru_param_attr=None,
-                gru_bias_attr=None,
-                act=None,
-                gate_act=None,
-                mixed_layer_attr=None,
-                gru_cell_attr=None):
-    """
-    simple_gru2 is the same with simple_gru, but using grumemory instead.
-    Please refer to grumemory in layers.py for more detail about the math.
-    simple_gru2 is faster than simple_gru.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = simple_gru2(input=[layer1], size=256)
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_param_attr: param parameter attribute of gru layer,
-                          None means default param.
-    :type gru_param_attr: ParameterAttribute|None
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name='%s_transform' % name,
-            size=size * 3,
-            bias_attr=mixed_bias_attr,
-            layer_attr=mixed_layer_attr) as m:
-        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
-
-    return grumemory(
-        name=name,
-        input=m,
-        reverse=reverse,
-        bias_attr=gru_bias_attr,
-        param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        layer_attr=gru_cell_attr)
-
-
-@wrap_name_default("bidirectional_gru")
-def bidirectional_gru(input,
-                      size,
-                      name=None,
-                      return_seq=False,
-                      fwd_mixed_param_attr=None,
-                      fwd_mixed_bias_attr=None,
-                      fwd_gru_param_attr=None,
-                      fwd_gru_bias_attr=None,
-                      fwd_act=None,
-                      fwd_gate_act=None,
-                      fwd_mixed_layer_attr=None,
-                      fwd_gru_cell_attr=None,
-                      bwd_mixed_param_attr=None,
-                      bwd_mixed_bias_attr=None,
-                      bwd_gru_param_attr=None,
-                      bwd_gru_bias_attr=None,
-                      bwd_act=None,
-                      bwd_gate_act=None,
-                      bwd_mixed_layer_attr=None,
-                      bwd_gru_cell_attr=None,
-                      last_seq_attr=None,
-                      first_seq_attr=None,
-                      concat_attr=None,
-                      concat_act=None):
-    """
-    A bidirectional_gru is a recurrent unit that iterates over the input
-    sequence both in forward and backward orders, and then concatenate two
-    outputs to form a final output. However, concatenation of two outputs
-    is not the only way to form the final output, you can also, for example,
-    just add them together.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        bi_gru = bidirectional_gru(input=[input1], size=512)
-
-    :param name: bidirectional gru layer name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param size: gru layer size.
-    :type size: int
-    :param return_seq: If set False, the last time step of output are
-                       concatenated and returned.
-                       If set True, the entire output sequences in forward
-                       and backward directions are concatenated and returned.
-    :type return_seq: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    args = locals()
-
-    fw = simple_gru2(
-        name='%s_fw' % name,
-        input=input,
-        size=size,
-        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('fwd_')))
-
-    bw = simple_gru2(
-        name="%s_bw" % name,
-        input=input,
-        size=size,
-        reverse=True,
-        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('bwd_')))
-
-    if return_seq:
-        return concat_layer(
-            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
-    else:
-        fw_seq = last_seq(
-            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
-        bw_seq = first_seq(
-            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
-        return concat_layer(
-            name=name,
-            input=[fw_seq, bw_seq],
-            layer_attr=concat_attr,
-            act=concat_act)
-
-
-@wrap_name_default("bidirectional_lstm")
-def bidirectional_lstm(input,
-                       size,
-                       name=None,
-                       return_seq=False,
-                       fwd_mat_param_attr=None,
-                       fwd_bias_param_attr=None,
-                       fwd_inner_param_attr=None,
-                       fwd_act=None,
-                       fwd_gate_act=None,
-                       fwd_state_act=None,
-                       fwd_mixed_layer_attr=None,
-                       fwd_lstm_cell_attr=None,
-                       bwd_mat_param_attr=None,
-                       bwd_bias_param_attr=None,
-                       bwd_inner_param_attr=None,
-                       bwd_act=None,
-                       bwd_gate_act=None,
-                       bwd_state_act=None,
-                       bwd_mixed_layer_attr=None,
-                       bwd_lstm_cell_attr=None,
-                       last_seq_attr=None,
-                       first_seq_attr=None,
-                       concat_attr=None,
-                       concat_act=None):
-    """
-    A bidirectional_lstm is a recurrent unit that iterates over the input
-    sequence both in forward and backward orders, and then concatenate two
-    outputs to form a final output. However, concatenation of two outputs
-    is not the only way to form the final output, you can also, for example,
-    just add them together.
-
-    Please refer to  **Neural Machine Translation by Jointly Learning to Align
-    and Translate** for more details about the bidirectional lstm.
-    The link goes as follows:
-    .. _Link: https://arxiv.org/pdf/1409.0473v3.pdf
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        bi_lstm = bidirectional_lstm(input=[input1], size=512)
-
-    :param name: bidirectional lstm layer name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param size: lstm layer size.
-    :type size: int
-    :param return_seq: If set False, the last time step of output are
-                       concatenated and returned.
-                       If set True, the entire output sequences in forward
-                       and backward directions are concatenated and returned.
-    :type return_seq: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    args = locals()
-
-    fw = simple_lstm(
-        name='%s_fw' % name,
-        input=input,
-        size=size,
-        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('fwd_')))
-
-    bw = simple_lstm(
-        name="%s_bw" % name,
-        input=input,
-        size=size,
-        reverse=True,
-        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('bwd_')))
-
-    if return_seq:
-        return concat_layer(
-            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
-    else:
-        fw_seq = last_seq(
-            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
-        bw_seq = first_seq(
-            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
-        return concat_layer(
-            name=name,
-            input=[fw_seq, bw_seq],
-            layer_attr=concat_attr,
-            act=concat_act)
-
-
-@wrap_name_default()
-@wrap_act_default(param_names=['weight_act'], act=TanhActivation())
-def simple_attention(encoded_sequence,
-                     encoded_proj,
-                     decoder_state,
-                     transform_param_attr=None,
-                     softmax_param_attr=None,
-                     weight_act=None,
-                     name=None):
-    """
-    Calculate and return a context vector with attention mechanism.
-    Size of the context vector equals to size of the encoded_sequence.
-
-    ..  math::
-
-        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
-
-        e_{i,j} & = a(s_{i-1}, h_{j})
-
-        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
-
-        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
-
-    where :math:`h_{j}` is the jth element of encoded_sequence,
-    :math:`U_{a}h_{j}` is the jth element of encoded_proj
-    :math:`s_{i-1}` is decoder_state
-    :math:`f` is weight_act, and is set to tanh by default.
-
-    Please refer to **Neural Machine Translation by Jointly Learning to
-    Align and Translate** for more details. The link is as follows:
-    https://arxiv.org/abs/1409.0473.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = simple_attention(encoded_sequence=enc_seq,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_prev,)
-
-    :param name: name of the attention model.
-    :type name: basestring
-    :param softmax_param_attr: parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param weight_act: activation of the attention model.
-    :type weight_act: BaseActivation
-    :param encoded_sequence: output of the encoder
-    :type encoded_sequence: LayerOutput
-    :param encoded_proj: attention weight is computed by a feed forward neural
-                         network which has two inputs : decoder's hidden state
-                         of previous time step and encoder's output.
-                         encoded_proj is output of the feed-forward network for
-                         encoder's output. Here we pre-compute it outside
-                         simple_attention for speed consideration.
-    :type encoded_proj: LayerOutput
-    :param decoder_state: hidden state of decoder in previous time step
-    :type decoder_state: LayerOutput
-    :param transform_param_attr: parameter attribute of the feed-forward
-                                network that takes decoder_state as inputs to
-                                compute attention weight.
-    :type transform_param_attr: ParameterAttribute
-    :return: a context vector
-    :rtype: LayerOutput
-    """
-    assert encoded_proj.size == decoder_state.size
-    proj_size = encoded_proj.size
-
-    with mixed_layer(size=proj_size, name="%s_transform" % name) as m:
-        m += full_matrix_projection(
-            decoder_state, param_attr=transform_param_attr)
-
-    expanded = expand_layer(
-        input=m, expand_as=encoded_sequence, name='%s_expand' % name)
-
-    with mixed_layer(
-            size=proj_size, act=weight_act, name="%s_combine" % name) as m:
-        m += identity_projection(expanded)
-        m += identity_projection(encoded_proj)
-
-    # sequence softmax is used to normalize similarities between decoder state
-    # and encoder outputs into a distribution
-    attention_weight = fc_layer(
-        input=m,
-        size=1,
-        act=SequenceSoftmaxActivation(),
-        param_attr=softmax_param_attr,
-        name="%s_softmax" % name,
-        bias_attr=False)
-
-    scaled = scaling_layer(
-        weight=attention_weight,
-        input=encoded_sequence,
-        name='%s_scaling' % name)
-
-    return pooling_layer(
-        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
-
-
-@wrap_name_default()
-def dot_product_attention(encoded_sequence,
-                          attended_sequence,
-                          transformed_state,
-                          softmax_param_attr=None,
-                          name=None):
-    """
-    Calculate and return a context vector with dot-product attention mechanism.
-    The dimension of the context vector equals to that of the attended_sequence.
-
-    ..  math::
-
-        a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
-
-        e_{i,j} & = a(s_{i-1}, h_{j})
-
-        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
-
-        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
-
-    where :math:`h_{j}` is the jth element of encoded_sequence,
-    :math:`z_{j}` is the jth element of attended_sequence,
-    :math:`s_{i-1}` is transformed_state.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = dot_product_attention(encoded_sequence=enc_seq,
-                                        attended_sequence=att_seq,
-                                        transformed_state=state,)
-
-    :param name: A prefix attached to the name of each layer that defined inside
-                 the dot_product_attention.
-    :type name: basestring
-    :param softmax_param_attr: The parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param encoded_sequence: The output hidden vectors of the encoder.
-    :type encoded_sequence: LayerOutput
-    :param attended_sequence: The attention weight is computed by a feed forward neural
-                              network which has two inputs : decoder's transformed hidden
-                              state of previous time step and encoder's output.
-                              attended_sequence is the sequence to be attended.
-    :type attended_sequence: LayerOutput
-    :param transformed_state: The transformed hidden state of decoder in previous time step.
-                              Since the dot-product operation will be performed on it and the
-                              encoded_sequence, their dimensions must be equal. For flexibility,
-                              we suppose transformations of the decoder's hidden state have been
-                              done outside dot_product_attention and no more will be performed
-                              inside. Then users can use either the original or transformed one.
-    :type transformed_state: LayerOutput
-    :return: The context vector.
-    :rtype: LayerOutput
-    """
-    assert transformed_state.size == encoded_sequence.size
-
-    expanded = expand_layer(
-        input=transformed_state,
-        expand_as=encoded_sequence,
-        name='%s_expand' % name)
-
-    m = dot_prod_layer(
-        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)
-
-    attention_weight = fc_layer(
-        input=m,
-        size=1,
-        act=SequenceSoftmaxActivation(),
-        param_attr=softmax_param_attr,
-        name="%s_softmax" % name,
-        bias_attr=False)
-
-    scaled = scaling_layer(
-        weight=attention_weight,
-        input=attended_sequence,
-        name='%s_scaling' % name)
-
-    return pooling_layer(
-        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
-
-
-@wrap_name_default()
-def multi_head_attention(query,
-                         key,
-                         value,
-                         key_proj_size,
-                         value_proj_size,
-                         head_num,
-                         attention_type,
-                         softmax_param_attr=None,
-                         name=None):
-    """
-    Calculate and return a context vector with dot-product attention mechanism.
-    The dimension of the context vector equals to value_proj_size * head_num.
-
-    Please refer to **Attention Is All You Need** for more details. The link is
-    as follows:
-    https://arxiv.org/abs/1706.03762.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = multi_head_attention(query=decoder_state,
-                                       key=enc_seq,
-                                       value=enc_seq,
-                                       key_proj_size=64,
-                                       value_pro_size=64,
-                                       head_num=8,
-                                       attention_type='dot-product attention')
-
-    :param name: A prefix attached to the name of each layer that defined inside
-                 the multi_head_attention.
-    :type name: basestring
-    :param softmax_param_attr: The parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param query: query is used to calculate attention weights over values at current step.
-    :type query: LayerOutput
-    :param key: key is used to calculate the attention weight of the corresponding value.
-    :type key: LayerOutput
-    :param value: value is the sequence to be attended.
-    :type value: LayerOutput
-    :param key_proj_size: The dimension of the linear projection performed on key and query.
-    :type key_proj_size: int
-    :param value_proj_size: The dimension of the linear projection performed on value.
-    :type value_proj_size: int
-    :param head_num: The number of attention heads.
-    :type head_num: int
-    :param attention_type: The type of the attention mechanism used in each attention
-                           heads. Now, we only support scaled dot-product attention and
-                           additive attention.
-    :type attention_type: basestring
-    :return: The context vector.
-    :rtype: LayerOutput
-    """
-    assert attention_type in ['dot-product attention', 'additive attention']
-
-    with mixed_layer(
-            size=key_proj_size * head_num,
-            name='%s_query_proj' % name) as query_proj:
-        query_proj += full_matrix_projection(query)
-    query_proj = expand_layer(input=query_proj, expand_as=key)
-
-    with mixed_layer(
-            size=key_proj_size * head_num,
-            name='%s_key_proj' % name) as key_proj:
-        key_proj += full_matrix_projection(key)
-
-    with mixed_layer(
-            size=value_proj_size * head_num,
-            name='%s_value_proj' % name) as value_proj:
-        value_proj += full_matrix_projection(value)
-
-    head_list = []
-    for i in range(head_num):
-        with mixed_layer(size=key_proj_size) as sub_query_proj:
-            sub_query_proj += identity_projection(
-                query_proj, offset=key_proj_size * i, size=key_proj_size)
-
-        with mixed_layer(size=key_proj_size) as sub_key_proj:
-            sub_key_proj += identity_projection(
-                key_proj, offset=key_proj_size * i, size=key_proj_size)
-
-        with mixed_layer(size=value_proj_size) as sub_value_proj:
-            sub_value_proj += identity_projection(
-                value_proj, offset=value_proj_size * i, size=value_proj_size)
-
-        if attention_type == 'dot-product attention':
-            m = dot_prod_layer(
-                input1=sub_query_proj,
-                input2=sub_key_proj,
-                name='%s_dot-product_%d' % (name, i))
-            m = slope_intercept_layer(
-                input=m,
-                slope=math.sqrt(1.0 / key_proj_size),
-                name='%s_dot-product_scaling_%d' % (name, i))
-        else:
-            with mixed_layer(
-                    size=key_proj_size,
-                    act=TanhActivation(),
-                    name='%s_combine_%d' % (name, i)) as m:
-                m += identity_projection(sub_query_proj)
-                m += identity_projection(sub_key_proj)
-
-        attention_weight = fc_layer(
-            input=m,
-            size=1,
-            act=SequenceSoftmaxActivation(),
-            param_attr=softmax_param_attr,
-            name="%s_softmax_%d" % (name, i),
-            bias_attr=False)
-
-        scaled = scaling_layer(
-            weight=attention_weight,
-            input=sub_value_proj,
-            name='%s_scaling_%d' % (name, i))
-        head = pooling_layer(
-            input=scaled,
-            pooling_type=SumPooling(),
-            name="%s_pooling_%d" % (name, i))
-
-        head_list.append(head)
-
-    attended = concat_layer(head_list)
-
-    return attended
-
-
-def inputs(layers, *args):
-    """
-    Declare the inputs of network. The order of input should be as same as
-    the data provider's return order.
-
-    :param layers: Input Layers.
-    :type layers: list|tuple|LayerOutput.
-    :return:
-    """
-
-    if isinstance(layers, LayerOutput) or isinstance(layers, basestring):
-        layers = [layers]
-    if len(args) != 0:
-        layers.extend(args)
-
-    Inputs(*[l.name for l in layers])
-
-
-def outputs(layers, *args):
-    """
-    Declare the outputs of network. If user has not defined the inputs of
-    network, this method will calculate the input order by dfs travel.
-
-    :param layers: Output layers.
-    :type layers: list|tuple|LayerOutput
-    :return:
-    """
-
-    traveled = set()
-
-    def __dfs_travel__(layer,
-                       predicate=lambda x: x.layer_type == LayerType.DATA):
-        """
-        DFS LRV Travel for output layer.
-
-        The return order is define order for data_layer in this leaf node.
-
-        :param layer:
-        :type layer: LayerOutput
-        :return:
-        """
-        if layer in traveled:
-            return []
-        else:
-            traveled.add(layer)
-
-        assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
-        retv = []
-        if layer.parents is not None:
-            for p in layer.parents:
-                retv.extend(__dfs_travel__(p, predicate))
-
-        if predicate(layer):
-            retv.append(layer)
-        return retv
-
-    if isinstance(layers, LayerOutput):
-        layers = [layers]
-
-    if len(args) != 0:
-        layers.extend(args)
-
-    assert len(layers) > 0
-
-    if HasInputsSet():  # input already set
-        Outputs(*[l.name for l in layers])
-        return  # just return outputs.
-
-    if len(layers) != 1:
-        logger.warning("`outputs` routine try to calculate network's"
-                       " inputs and outputs order. It might not work well."
-                       "Please see follow log carefully.")
-    inputs = []
-    outputs_ = []
-    for each_layer in layers:
-        assert isinstance(each_layer, LayerOutput)
-        inputs.extend(__dfs_travel__(each_layer))
-        outputs_.extend(
-            __dfs_travel__(each_layer,
-                           lambda x: x.layer_type == LayerType.COST))
-
-    # Currently, we got each leaf node's inputs order, output order.
-    # We merge them together.
-
-    final_inputs = []
-    final_outputs = []
-
-    for each_input in inputs:
-        assert isinstance(each_input, LayerOutput)
-        if each_input.name not in final_inputs:
-            final_inputs.append(each_input.name)
-
-    for each_output in outputs_:
-        assert isinstance(each_output, LayerOutput)
-        if each_output.name not in final_outputs:
-            final_outputs.append(each_output.name)
-
-    logger.info("".join(["The input order is [", ", ".join(final_inputs), "]"]))
-
-    if len(final_outputs) == 0:
-        final_outputs = map(lambda x: x.name, layers)
-
-    logger.info("".join(
-        ["The output order is [", ", ".join(final_outputs), "]"]))
-
-    Inputs(*final_inputs)
-    Outputs(*final_outputs)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
deleted file mode 100644
index 32698e5b2cb..00000000000
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ /dev/null
@@ -1,447 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import Settings, default_decay_rate, \
-    default_gradient_clipping_threshold, default_momentum
-
-from .default_decorators import wrap_param_default
-
-__all__ = [
-    'Optimizer', 'BaseSGDOptimizer', 'MomentumOptimizer', 'AdamaxOptimizer',
-    'AdamOptimizer', 'AdaGradOptimizer', 'RMSPropOptimizer',
-    'DecayedAdaGradOptimizer', 'AdaDeltaOptimizer', 'BaseRegularization',
-    'L2Regularization', 'settings', 'ModelAverage'
-]
-
-
-class Optimizer(object):
-    def to_setting_kwargs(self):
-        raise NotImplementedError()
-
-    def extra_settings(self):
-        pass
-
-    @property
-    def is_support_sparse(self):
-        return True
-
-
-class BaseSGDOptimizer(Optimizer):
-    """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
-
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
-
-    ..  math::
-
-        Q(w) = \\sum_{i}^{n} Q_i(w)
-
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
-
-    So, the SGD method will optimize the weight by
-
-    ..  math::
-
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
-    """
-
-    def to_setting_kwargs(self):
-        raise NotImplementedError()
-
-
-class MomentumOptimizer(BaseSGDOptimizer):
-    """
-    MomentumOptimizer.
-
-    When sparse=True, the update scheme:
-
-    ..  math::
-
-        \\alpha_t &= \\alpha_{t-1} / k \\\\
-        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
-        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
-        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
-        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
-    
-    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
-    :math:`\\gamma_t` is learning rate at the t'th step.
-
-    :param sparse: with sparse support or not.
-    :type sparse: bool
-    """
-
-    def extra_settings(self):
-        default_momentum(self.momentum)
-
-    def to_setting_kwargs(self):
-        if self.sparse:
-            return {'learning_method': 'sparse_momentum'}
-        else:
-            return {'learning_method': 'momentum'}
-
-    def __init__(self, momentum=None, sparse=False):
-        self.momentum = momentum
-        self.sparse = sparse
-
-
-class AdamOptimizer(BaseSGDOptimizer):
-    """
-    Adam optimizer.
-    The details of please refer `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
-
-    :param beta1: the :math:`\\beta_1` in equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in equation.
-    :type beta2: float
-    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
-                        divided by zero.
-    :type epsilon: float
-    """
-
-    @property
-    def is_support_sparse(self):
-        return False
-
-    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
-        self.beta1 = beta1
-        self.beta2 = beta2
-        self.epsilon = epsilon
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adam',
-            'adam_beta1': self.beta1,
-            'adam_beta2': self.beta2,
-            'adam_epsilon': self.epsilon
-        }
-
-
-class AdamaxOptimizer(BaseSGDOptimizer):
-    """
-    Adamax optimizer.
-
-    The details of please refer this `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
-
-    :param beta1: the :math:`\\beta_1` in the equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in the equation.
-    :type beta2: float
-    """
-
-    def __init__(self, beta1, beta2):
-        self.beta1 = beta1
-        self.beta2 = beta2
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adamax',
-            'adam_beta1': self.beta1,
-            'adam_beta2': self.beta2
-        }
-
-    @property
-    def is_support_sparse(self):
-        return False
-
-
-class AdaGradOptimizer(BaseSGDOptimizer):
-    """
-    Adagrad(for ADAptive GRAdient algorithm) optimizer.
-
-    For details please refer this `Adaptive Subgradient Methods for
-    Online Learning and Stochastic Optimization
-    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
-
-    ..  math::
-
-        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
-    """
-
-    def to_setting_kwargs(self):
-        return {'learning_method': 'adagrad'}
-
-    def __init__(self):
-        pass
-
-
-class RMSPropOptimizer(BaseSGDOptimizer):
-    """
-    RMSProp(for Root Mean Square Propagation) optimizer. For details please
-    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
-    lecture_slides_lec6.pdf>`_.
-
-    The equations of this method as follows:
-
-    ..  math::
-
-        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
-    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
-    :type rho: float
-    :param epsilon: the :math:`\\epsilon` in the equation.
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'rmsprop',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class DecayedAdaGradOptimizer(BaseSGDOptimizer):
-    """
-    AdaGrad method with decayed sum gradients. The equations of this method
-    show as follow.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )
-
-    :param rho: The :math:`\\rho` parameter in that equation
-    :type rho: float
-    :param epsilon: The :math:`\\epsilon` parameter in that equation.
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'decayed_adagrad',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class AdaDeltaOptimizer(BaseSGDOptimizer):
-    """
-    AdaDelta method. The details of adadelta please refer to this
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
-                          E(g_t^2) + \\epsilon ) ) \\\\
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
-
-    :param rho: :math:`\\rho` in equation
-    :type rho: float
-    :param epsilon: :math:`\\rho` in equation
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adadelta',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class BaseRegularization(Optimizer):
-    def __init__(self):
-        self.algorithm = ""
-        self.learning_method = ""
-
-    def to_setting_kwargs(self):
-        return {}
-
-
-class L2Regularization(BaseRegularization):
-    def __init__(self, rate):
-        super(L2Regularization, self).__init__()
-        self.decay_rate = rate
-
-    def to_setting_kwargs(self):
-        if self.algorithm == 'owlqn':
-            return {'l2weight': self.decay_rate}
-        else:
-            return dict()
-
-    def extra_settings(self):
-        if self.algorithm == 'sgd' or self.algorithm == 'async_sgd':
-            default_decay_rate(self.decay_rate)
-
-
-class ModelAverage(Optimizer):
-    def to_setting_kwargs(self):
-        return {
-            'average_window': self.average_window,
-            'max_average_window': self.max_average_window,
-            'do_average_in_cpu': self.do_average_in_cpu
-        }
-
-    def __init__(self,
-                 average_window,
-                 max_average_window=None,
-                 do_average_in_cpu=False):
-        self.average_window = average_window
-        self.max_average_window = max_average_window
-        self.do_average_in_cpu = do_average_in_cpu
-
-
-class GradientClippingThreshold(Optimizer):
-    def extra_settings(self):
-        default_gradient_clipping_threshold(self.threshold)
-
-    def __init__(self, threshold):
-        self.threshold = threshold
-
-    def to_setting_kwargs(self):
-        return dict()
-
-
-def __extends__(dict1, dict2):
-    for key in dict2:
-        assert key not in dict1
-        dict1[key] = dict2[key]
-    return dict1
-
-
-@wrap_param_default(
-    ['learning_method'], default_factory=lambda _: MomentumOptimizer())
-@wrap_param_default(
-    ['regularization'], default_factory=lambda _: BaseRegularization())
-def settings(batch_size,
-             learning_rate=1e-3,
-             learning_rate_decay_a=0.,
-             learning_rate_decay_b=0.,
-             learning_rate_schedule='poly',
-             learning_rate_args='',
-             async_lagged_grad_discard_ratio=1.5,
-             learning_method=None,
-             regularization=None,
-             is_async=False,
-             model_average=None,
-             gradient_clipping_threshold=None):
-    """
-    Set the optimization method, learning rate, batch size, and other training
-    settings. The currently supported algorithms are SGD and Async-SGD.
-
-    ..  warning::
-
-        Note that the 'batch_size' in PaddlePaddle is not equal to global
-        training batch size. It represents the single training process's batch
-        size. If you use N processes to train one model, for example use three
-        GPU machines, the global batch size is N*'batch_size'.
-
-    :param batch_size: batch size for one training process.
-    :type batch_size: int
-    :param learning_rate: learning rate for SGD
-    :type learning_rate: float
-    :param learning_method: The extension optimization algorithms of gradient
-                            descent, such as momentum, adagrad, rmsprop, etc.
-                            Note that it should be instance with base type
-                            BaseSGDOptimizer.
-    :type learning_method: BaseSGDOptimizer
-    :param regularization: The regularization method.
-    :type regularization: BaseRegularization
-    :param is_async: Is Async-SGD or not. Default value is False.
-    :type is_async: bool
-    :param model_average: Model Average Settings.
-    :type model_average: ModelAverage
-    :param gradient_clipping_threshold: gradient clipping threshold. If gradient
-                                        value larger than some value, will be
-                                        clipped.
-    :type gradient_clipping_threshold: float
-    :param async_lagged_grad_discard_ratio: async SGD gradient commit control,
-          when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, 
-          the current async SGD gradient is discarded.
-    :type async_lagged_grad_discard_ratio: float
-    """
-    if isinstance(regularization, BaseRegularization):
-        regularization = [regularization]
-
-    assert isinstance(learning_method, Optimizer)
-    if isinstance(learning_method, BaseSGDOptimizer):
-        algorithm = 'async_sgd' if is_async else 'sgd'
-    else:
-        algorithm = 'owlqn'
-
-    args = [
-        'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'gradient_clipping_threshold', 'async_lagged_grad_discard_ratio'
-    ]
-    kwargs = dict()
-    kwargs['algorithm'] = algorithm
-    for arg in args:
-        kwargs[arg] = locals()[arg]
-
-    kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
-    learning_method.extra_settings()
-
-    for regular in regularization:
-        assert isinstance(regular, BaseRegularization)
-        regular.algorithm = algorithm
-        regular.learning_method = kwargs['learning_method']
-        kwargs = __extends__(kwargs, regular.to_setting_kwargs())
-        regular.extra_settings()
-
-    if gradient_clipping_threshold is not None:
-        gradient_clipping_threshold = GradientClippingThreshold(
-            threshold=gradient_clipping_threshold)
-
-    for each in [model_average, gradient_clipping_threshold]:
-        if each is not None:
-            assert isinstance(each, Optimizer)
-            each.algorithm = algorithm
-            each.learning_method = kwargs['learning_method']
-            kwargs = __extends__(kwargs, each.to_setting_kwargs())
-            each.extra_settings()
-
-    # Do Check?
-    Settings(**kwargs)
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
deleted file mode 100644
index e0aeb311b3a..00000000000
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-"""
-
-__all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
-    "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling",
-    "SumPooling", "SquareRootNPooling"
-]
-
-
-class BasePoolingType(object):
-    """
-    Base Pooling Type.
-    Note these pooling types are used for sequence input, not for images.
-    Each PoolingType contains one parameter:
-
-    :param name: pooling layer type name used by paddle.
-    :type name: basestring
-
-    """
-
-    def __init__(self, name):
-        self.name = name
-
-
-class MaxPooling(BasePoolingType):
-    """
-    Max pooling.
-
-    Return the very large values for each dimension in sequence or time steps.
-
-    ..  math::
-
-        max(samples\\_of\\_a\\_sequence)
-
-    :param output_max_index: True if output sequence max index instead of max
-                             value. None means use default value in proto.
-    :type output_max_index: bool|None
-    """
-
-    def __init__(self, output_max_index=None):
-        BasePoolingType.__init__(self, "max")
-        self.output_max_index = output_max_index
-
-
-class MaxWithMaskPooling(BasePoolingType):
-    """
-    MaxWithMask pooling.
-
-    Not only return the very large values for each dimension in sequence or time steps,
-    but also the location indices of found maxinum values.
-
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "max-pool-with-mask")
-
-
-class CudnnMaxPooling(BasePoolingType):
-    """
-    Cudnn max pooling only support GPU. Return the maxinum value in the
-    pooling window.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-max-pool")
-
-
-class CudnnAvgPooling(BasePoolingType):
-    """
-    Cudnn average pooling only support GPU. Return the average value in the
-    pooling window.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-avg-pool")
-
-
-class CudnnAvgInclPadPooling(BasePoolingType):
-    """
-    Cudnn average pooling only support GPU. Return the average value in the
-    pooling window taking into account the padding cells.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool")
-
-
-class AvgPooling(BasePoolingType):
-    """
-    Average pooling.
-
-    Return the average values for each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)/sample\\_num
-    """
-    STRATEGY_AVG = "average"
-    STRATEGY_SUM = "sum"
-    STRATEGY_SQROOTN = "squarerootn"
-
-    def __init__(self, strategy=STRATEGY_AVG):
-        BasePoolingType.__init__(self, "average")
-        self.strategy = strategy
-
-
-class SumPooling(AvgPooling):
-    """
-    Sum pooling.
-
-    Return the sum values of each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)
-    """
-
-    def __init__(self):
-        AvgPooling.__init__(self, AvgPooling.STRATEGY_SUM)
-
-
-class SquareRootNPooling(AvgPooling):
-    """
-    Square Root Pooling.
-
-    Return the square root values of each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)/sqrt(sample\\_num)
-    """
-
-    def __init__(self):
-        AvgPooling.__init__(self, AvgPooling.STRATEGY_SQROOTN)
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
deleted file mode 100644
index 30e0b9906c4..00000000000
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-#################### test_config_parser #########################
-add_test(NAME layers_test
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
-
-add_test(NAME test_reset_hook
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
-
-add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
-add_test(NAME test_layerHelpers
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-  ${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-)
diff --git a/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp b/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
deleted file mode 100644
index 7b10e0b7a60..00000000000
--- a/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <google/protobuf/text_format.h>
-#include <google/protobuf/util/message_differencer.h>
-#include <fstream>
-#include <iostream>
-#include "TrainerConfig.pb.h"
-
-bool loadPb(google::protobuf::Message* conf, const std::string& filename) {
-  std::ifstream fin;
-  fin.open(filename.c_str());
-  if (fin.is_open()) {
-    std::string str((std::istreambuf_iterator<char>(fin)),
-                    std::istreambuf_iterator<char>());
-    bool ok = google::protobuf::TextFormat::ParseFromString(str, conf);
-    fin.close();
-    return ok;
-  } else {
-    return false;
-  }
-}
-
-int main(int argc, char** argv) {
-  std::unique_ptr<google::protobuf::Message> config1;
-  std::unique_ptr<google::protobuf::Message> config2;
-  if (argc == 3) {
-    config1.reset(new paddle::ModelConfig());
-    config2.reset(new paddle::ModelConfig());
-  } else if (argc == 4) {
-    config1.reset(new paddle::TrainerConfig());
-    config2.reset(new paddle::TrainerConfig());
-  }
-  if (!config1 || !config2) {
-    return 1;
-  } else if (!loadPb(config1.get(), argv[1])) {
-    return 2;
-  } else if (!loadPb(config2.get(), argv[2])) {
-    return 3;
-  } else {
-    if (google::protobuf::util::MessageDifferencer::ApproximatelyEquals(
-            *config1, *config2)) {
-      return 0;
-    } else {
-      return 4;
-    }
-  }
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/.gitignore b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
deleted file mode 100644
index c654bd41b0b..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-protostr/*.unittest
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
deleted file mode 100755
index 10c941f7074..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-export configs=(test_repeat_layer test_fc layer_activations projections test_print_layer
-test_sequence_pooling test_lstmemory_layer test_grumemory_layer
-last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
-img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
-test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
-test_factorization_machine)
-
-export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
deleted file mode 100755
index 44a75a60cc7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-cd `dirname $0`
-
-protostr=$PWD/protostr
-. file_list.sh
-
-for conf in ${configs[*]}
-do
-    echo "Generating " $conf
-    $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    if [ ! -f "$protostr/$conf.protostr" ]; then 
-        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
-    fi
-    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
-done
-
-for conf in ${whole_configs[*]}
-do
-    echo "Generating " $conf
-    $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    if [ ! -f "$protostr/$conf.protostr" ]; then 
-        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
-    fi
-    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
-done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
deleted file mode 100644
index 767b6454242..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-3, batch_size=1000)
-
-img = data_layer(name='image', size=256 * 256)
-
-# the parse_conv in config_parse.py is not strictly accurate when filter_size
-# is not square. So here set square filter_size.
-img_conv = img_conv_layer(
-    input=img,
-    num_channels=1,
-    num_filters=64,
-    filter_size=(32, 32),
-    padding=(1, 1),
-    dilation=(1, 1),
-    stride=(1, 1),
-    act=LinearActivation())
-img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
-
-img_norm = img_cmrnorm_layer(input=img_bn, size=32)
-
-img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling())
-
-outputs(img_pool, img_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
deleted file mode 100644
index e17c8fa7c0a..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-3, batch_size=1000)
-
-img = data_layer(name='image', size=227 * 227)
-
-# the parse_conv in config_parse.py is not strictly accurate when filter_size
-# is not square. So here set square filter_size.
-img_conv = img_conv_layer(
-    input=img,
-    num_channels=1,
-    num_filters=64,
-    filter_size=(32, 32),
-    padding=(1, 1),
-    stride=(1, 1),
-    act=LinearActivation(),
-    trans=True)
-img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
-
-img_norm = img_cmrnorm_layer(input=img_bn, size=32)
-
-img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling())
-
-outputs(img_pool, img_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
deleted file mode 100644
index 5b6d2627e43..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-
-seq_op = [first_seq, last_seq]
-
-agg_level = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
-
-opts = []
-
-for op in seq_op:
-    for al in agg_level:
-        opts.append(op(input=din, agg_level=al))
-
-for op in seq_op:
-    opts.append(
-        op(input=din, agg_level=AggregateLevel.TO_NO_SEQUENCE, stride=5))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
deleted file mode 100644
index ac1f7e02c09..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Test all activations.
-'''
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='input', size=100)
-
-acts = [
-    TanhActivation, SigmoidActivation, SoftmaxActivation, IdentityActivation,
-    LinearActivation, ExpActivation, ReluActivation, BReluActivation,
-    SoftReluActivation, STanhActivation, AbsActivation, SquareActivation
-]
-
-outputs([
-    fc_layer(
-        input=din, size=100, act=act(), name="layer_%d" % i)
-    for i, act in enumerate(acts)
-])
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
deleted file mode 100644
index 29dc634fb39..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-x = data_layer(name='data', size=100)
-x = layer_math.exp(x)
-x = layer_math.sqrt(x)
-x = layer_math.reciprocal(x)
-x = layer_math.log(x)
-x = layer_math.abs(x)
-x = layer_math.sigmoid(x)
-x = layer_math.tanh(x)
-x = layer_math.square(x)
-x = layer_math.relu(x)
-y = 1 + x
-y = y + 1
-y = x + y
-y = y - x
-y = y - 2
-y = 2 - y
-y = 2 * y
-y = y * 3
-z = data_layer(name='data_2', size=1)
-y = y * z
-y = z * y
-y = y + z
-y = z + y
-outputs(y)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py
deleted file mode 100644
index 3b7a196d1c1..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Test mixed layer, projections and operators.
-'''
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='test', size=100)
-
-din = embedding_layer(input=din, size=256)
-
-with mixed_layer(size=100) as m1:
-    m1 += full_matrix_projection(input=din)
-
-with mixed_layer(size=100) as m2:
-    m2 += table_projection(input=m1)
-
-with mixed_layer(size=100) as m3:
-    m3 += identity_projection(input=m2)
-
-with mixed_layer(size=100) as m4:
-    m4 += dotmul_projection(input=m3)
-
-with mixed_layer() as m5:
-    m5 += context_projection(input=m4, context_len=3)
-
-with mixed_layer() as m6:
-    m6 += dotmul_operator(a=m3, b=m4)
-    m6 += scaling_projection(m3)
-
-img = data_layer(name='img', size=32 * 32)
-flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
-
-with mixed_layer() as m7:
-    m7 += conv_operator(
-        img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3)
-    m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1)
-
-with mixed_layer() as m8:
-    m8 += conv_operator(
-        img=img,
-        filter=flt,
-        num_filters=64,
-        num_channels=1,
-        filter_size=3,
-        stride=2,
-        padding=1,
-        trans=True)
-    m8 += conv_projection(
-        img,
-        filter_size=3,
-        num_filters=64,
-        num_channels=1,
-        stride=2,
-        padding=1,
-        trans=True)
-end = mixed_layer(
-    input=[
-        full_matrix_projection(input=m5),
-        trans_full_matrix_projection(input=m6),
-        full_matrix_projection(input=m7), full_matrix_projection(input=m8)
-    ],
-    size=100,
-    layer_attr=ExtraAttr(
-        drop_rate=0.5, error_clipping_threshold=40))
-
-outputs(end)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
deleted file mode 100644
index 3e0f9576488..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ /dev/null
@@ -1,193 +0,0 @@
-type: "nn"
-layers {
-  name: "image"
-  type: "data"
-  size: 65536
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 3297856
-  active_type: ""
-  inputs {
-    input_layer_name: "image"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 32
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 227
-      img_size: 256
-      caffe_mode: true
-      filter_size_y: 32
-      padding_y: 1
-      stride_y: 1
-      output_y: 227
-      img_size_y: 256
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 64
-  shared_biases: true
-  height: 227
-  width: 227
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 3297856
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 64
-      img_size: 227
-      img_size_y: 227
-    }
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 227
-  width: 227
-  depth: 1
-  epsilon: 1e-05
-}
-layers {
-  name: "__crmnorm_0__"
-  type: "norm"
-  size: 3297856
-  active_type: ""
-  inputs {
-    input_layer_name: "__batch_norm_0__"
-    norm_conf {
-      norm_type: "cmrnorm-projection"
-      channels: 64
-      size: 32
-      scale: 0.0004
-      pow: 0.75
-      output_x: 227
-      img_size: 227
-      blocked: false
-      output_y: 227
-      img_size_y: 227
-    }
-  }
-  height: 227
-  width: 227
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 2458624
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 64
-      size_x: 32
-      stride: 1
-      output_x: 196
-      img_size: 227
-      padding: 0
-      size_y: 32
-      stride_y: 1
-      output_y: 196
-      img_size_y: 227
-      padding_y: 0
-    }
-  }
-  height: 196
-  width: 196
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 65536
-  initial_mean: 0.0
-  initial_std: 0.0441941738242
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 64
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 64
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "image"
-output_layer_names: "__pool_0__"
-output_layer_names: "__crmnorm_0__"
-sub_models {
-  name: "root"
-  layer_names: "image"
-  layer_names: "__conv_0__"
-  layer_names: "__batch_norm_0__"
-  layer_names: "__crmnorm_0__"
-  layer_names: "__pool_0__"
-  input_layer_names: "image"
-  output_layer_names: "__pool_0__"
-  output_layer_names: "__crmnorm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
deleted file mode 100644
index a18a4652e14..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ /dev/null
@@ -1,193 +0,0 @@
-type: "nn"
-layers {
-  name: "image"
-  type: "data"
-  size: 51529
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconvt"
-  size: 4194304
-  active_type: ""
-  inputs {
-    input_layer_name: "image"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 32
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 64
-      output_x: 227
-      img_size: 256
-      caffe_mode: true
-      filter_size_y: 32
-      padding_y: 1
-      stride_y: 1
-      output_y: 227
-      img_size_y: 256
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 64
-  shared_biases: true
-  height: 256
-  width: 256
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 4194304
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 64
-      img_size: 256
-      img_size_y: 256
-    }
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 256
-  width: 256
-  depth: 1
-  epsilon: 1e-05
-}
-layers {
-  name: "__crmnorm_0__"
-  type: "norm"
-  size: 4194304
-  active_type: ""
-  inputs {
-    input_layer_name: "__batch_norm_0__"
-    norm_conf {
-      norm_type: "cmrnorm-projection"
-      channels: 64
-      size: 32
-      scale: 0.0004
-      pow: 0.75
-      output_x: 256
-      img_size: 256
-      blocked: false
-      output_y: 256
-      img_size_y: 256
-    }
-  }
-  height: 256
-  width: 256
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 3240000
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 64
-      size_x: 32
-      stride: 1
-      output_x: 225
-      img_size: 256
-      padding: 0
-      size_y: 32
-      stride_y: 1
-      output_y: 225
-      img_size_y: 256
-      padding_y: 0
-    }
-  }
-  height: 225
-  width: 225
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 65536
-  initial_mean: 0.0
-  initial_std: 0.0441941738242
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 64
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 64
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "image"
-output_layer_names: "__pool_0__"
-output_layer_names: "__crmnorm_0__"
-sub_models {
-  name: "root"
-  layer_names: "image"
-  layer_names: "__conv_0__"
-  layer_names: "__batch_norm_0__"
-  layer_names: "__crmnorm_0__"
-  layer_names: "__pool_0__"
-  input_layer_names: "image"
-  output_layer_names: "__pool_0__"
-  output_layer_names: "__crmnorm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
deleted file mode 100644
index fee0f8e462b..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ /dev/null
@@ -1,102 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_1__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_2__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-input_layer_names: "data"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__first_seq_1__"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__first_seq_2__"
-output_layer_names: "__last_seq_2__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__first_seq_0__"
-  layer_names: "__first_seq_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__first_seq_2__"
-  layer_names: "__last_seq_2__"
-  input_layer_names: "data"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__first_seq_1__"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__first_seq_2__"
-  output_layer_names: "__last_seq_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
deleted file mode 100644
index ecf39e4d321..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
+++ /dev/null
@@ -1,423 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "layer_0"
-  type: "fc"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_0.w0"
-  }
-  bias_parameter_name: "_layer_0.wbias"
-}
-layers {
-  name: "layer_1"
-  type: "fc"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_1.w0"
-  }
-  bias_parameter_name: "_layer_1.wbias"
-}
-layers {
-  name: "layer_2"
-  type: "fc"
-  size: 100
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_2.w0"
-  }
-  bias_parameter_name: "_layer_2.wbias"
-}
-layers {
-  name: "layer_3"
-  type: "fc"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_3.w0"
-  }
-  bias_parameter_name: "_layer_3.wbias"
-}
-layers {
-  name: "layer_4"
-  type: "fc"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_4.w0"
-  }
-  bias_parameter_name: "_layer_4.wbias"
-}
-layers {
-  name: "layer_5"
-  type: "fc"
-  size: 100
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_5.w0"
-  }
-  bias_parameter_name: "_layer_5.wbias"
-}
-layers {
-  name: "layer_6"
-  type: "fc"
-  size: 100
-  active_type: "relu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_6.w0"
-  }
-  bias_parameter_name: "_layer_6.wbias"
-}
-layers {
-  name: "layer_7"
-  type: "fc"
-  size: 100
-  active_type: "brelu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_7.w0"
-  }
-  bias_parameter_name: "_layer_7.wbias"
-}
-layers {
-  name: "layer_8"
-  type: "fc"
-  size: 100
-  active_type: "softrelu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_8.w0"
-  }
-  bias_parameter_name: "_layer_8.wbias"
-}
-layers {
-  name: "layer_9"
-  type: "fc"
-  size: 100
-  active_type: "stanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_9.w0"
-  }
-  bias_parameter_name: "_layer_9.wbias"
-}
-layers {
-  name: "layer_10"
-  type: "fc"
-  size: 100
-  active_type: "abs"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_10.w0"
-  }
-  bias_parameter_name: "_layer_10.wbias"
-}
-layers {
-  name: "layer_11"
-  type: "fc"
-  size: 100
-  active_type: "square"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_11.w0"
-  }
-  bias_parameter_name: "_layer_11.wbias"
-}
-parameters {
-  name: "_layer_0.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_0.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_1.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_1.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_2.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_2.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_3.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_3.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_4.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_4.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_5.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_5.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_6.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_6.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_7.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_7.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_8.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_8.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_9.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_9.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_10.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_10.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_11.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_11.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "layer_0"
-output_layer_names: "layer_1"
-output_layer_names: "layer_2"
-output_layer_names: "layer_3"
-output_layer_names: "layer_4"
-output_layer_names: "layer_5"
-output_layer_names: "layer_6"
-output_layer_names: "layer_7"
-output_layer_names: "layer_8"
-output_layer_names: "layer_9"
-output_layer_names: "layer_10"
-output_layer_names: "layer_11"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "layer_0"
-  layer_names: "layer_1"
-  layer_names: "layer_2"
-  layer_names: "layer_3"
-  layer_names: "layer_4"
-  layer_names: "layer_5"
-  layer_names: "layer_6"
-  layer_names: "layer_7"
-  layer_names: "layer_8"
-  layer_names: "layer_9"
-  layer_names: "layer_10"
-  layer_names: "layer_11"
-  input_layer_names: "input"
-  output_layer_names: "layer_0"
-  output_layer_names: "layer_1"
-  output_layer_names: "layer_2"
-  output_layer_names: "layer_3"
-  output_layer_names: "layer_4"
-  output_layer_names: "layer_5"
-  output_layer_names: "layer_6"
-  output_layer_names: "layer_7"
-  output_layer_names: "layer_8"
-  output_layer_names: "layer_9"
-  output_layer_names: "layer_10"
-  output_layer_names: "layer_11"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
deleted file mode 100644
index 582207741ab..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ /dev/null
@@ -1,413 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__exp_0__"
-  type: "mixed"
-  size: 100
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "data"
-    proj_conf {
-      type: "identity"
-      name: "___exp_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__sqrt_0__"
-  type: "mixed"
-  size: 100
-  active_type: "sqrt"
-  inputs {
-    input_layer_name: "__exp_0__"
-    proj_conf {
-      type: "identity"
-      name: "___sqrt_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__reciprocal_0__"
-  type: "mixed"
-  size: 100
-  active_type: "reciprocal"
-  inputs {
-    input_layer_name: "__sqrt_0__"
-    proj_conf {
-      type: "identity"
-      name: "___reciprocal_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__log_0__"
-  type: "mixed"
-  size: 100
-  active_type: "log"
-  inputs {
-    input_layer_name: "__reciprocal_0__"
-    proj_conf {
-      type: "identity"
-      name: "___log_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__abs_0__"
-  type: "mixed"
-  size: 100
-  active_type: "abs"
-  inputs {
-    input_layer_name: "__log_0__"
-    proj_conf {
-      type: "identity"
-      name: "___abs_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__sigmoid_0__"
-  type: "mixed"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__abs_0__"
-    proj_conf {
-      type: "identity"
-      name: "___sigmoid_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__tanh_0__"
-  type: "mixed"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__sigmoid_0__"
-    proj_conf {
-      type: "identity"
-      name: "___tanh_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__square_0__"
-  type: "mixed"
-  size: 100
-  active_type: "square"
-  inputs {
-    input_layer_name: "__tanh_0__"
-    proj_conf {
-      type: "identity"
-      name: "___square_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__relu_0__"
-  type: "mixed"
-  size: 100
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__square_0__"
-    proj_conf {
-      type: "identity"
-      name: "___relu_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_0__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-  }
-  slope: 1.0
-  intercept: 1
-}
-layers {
-  name: "__slope_intercept_layer_1__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_0__"
-  }
-  slope: 1.0
-  intercept: 1
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_0__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_2__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-  }
-  slope: -1.0
-  intercept: 0.0
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_2__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_1__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_3__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__"
-  }
-  slope: 1.0
-  intercept: -2
-}
-layers {
-  name: "__slope_intercept_layer_4__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_3__"
-  }
-  slope: -1.0
-  intercept: 0.0
-}
-layers {
-  name: "__slope_intercept_layer_5__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_4__"
-  }
-  slope: 1.0
-  intercept: 2
-}
-layers {
-  name: "__slope_intercept_layer_6__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_5__"
-  }
-  slope: 2
-  intercept: 0.0
-}
-layers {
-  name: "__slope_intercept_layer_7__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_6__"
-  }
-  slope: 3
-  intercept: 0.0
-}
-layers {
-  name: "data_2"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__scaling_layer_0__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_7__"
-  }
-}
-layers {
-  name: "__scaling_layer_1__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  inputs {
-    input_layer_name: "__scaling_layer_0__"
-  }
-}
-layers {
-  name: "__repeat_layer_0__"
-  type: "featmap_expand"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  num_filters: 100
-}
-layers {
-  name: "__mixed_2__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__scaling_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__repeat_layer_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__repeat_layer_1__"
-  type: "featmap_expand"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  num_filters: 100
-}
-layers {
-  name: "__mixed_3__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_3__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__repeat_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_3__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-input_layer_names: "data_2"
-input_layer_names: "data"
-output_layer_names: "__mixed_3__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__exp_0__"
-  layer_names: "__sqrt_0__"
-  layer_names: "__reciprocal_0__"
-  layer_names: "__log_0__"
-  layer_names: "__abs_0__"
-  layer_names: "__sigmoid_0__"
-  layer_names: "__tanh_0__"
-  layer_names: "__square_0__"
-  layer_names: "__relu_0__"
-  layer_names: "__slope_intercept_layer_0__"
-  layer_names: "__slope_intercept_layer_1__"
-  layer_names: "__mixed_0__"
-  layer_names: "__slope_intercept_layer_2__"
-  layer_names: "__mixed_1__"
-  layer_names: "__slope_intercept_layer_3__"
-  layer_names: "__slope_intercept_layer_4__"
-  layer_names: "__slope_intercept_layer_5__"
-  layer_names: "__slope_intercept_layer_6__"
-  layer_names: "__slope_intercept_layer_7__"
-  layer_names: "data_2"
-  layer_names: "__scaling_layer_0__"
-  layer_names: "__scaling_layer_1__"
-  layer_names: "__repeat_layer_0__"
-  layer_names: "__mixed_2__"
-  layer_names: "__repeat_layer_1__"
-  layer_names: "__mixed_3__"
-  input_layer_names: "data_2"
-  input_layer_names: "data"
-  output_layer_names: "__mixed_3__"
-  is_recurrent_layer_group: false
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
deleted file mode 100644
index d8bd7b9dfb7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ /dev/null
@@ -1,466 +0,0 @@
-type: "nn"
-layers {
-  name: "test"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__embedding_0__"
-  type: "mixed"
-  size: 256
-  active_type: ""
-  inputs {
-    input_layer_name: "test"
-    input_parameter_name: "___embedding_0__.w0"
-    proj_conf {
-      type: "table"
-      name: "___embedding_0__.w0"
-      input_size: 100
-      output_size: 256
-    }
-  }
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__embedding_0__"
-    input_parameter_name: "___mixed_0__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 256
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__"
-    input_parameter_name: "___mixed_1__.w0"
-    proj_conf {
-      type: "table"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_2__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_3__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-    input_parameter_name: "___mixed_3__.w0"
-    proj_conf {
-      type: "dot_mul"
-      name: "___mixed_3__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_4__"
-  type: "mixed"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_3__"
-    input_parameter_name: "___mixed_4__.w0"
-    proj_conf {
-      type: "context"
-      name: "___mixed_4__.w0"
-      input_size: 100
-      output_size: 300
-      context_start: -1
-      context_length: 3
-      trainable_padding: true
-    }
-  }
-}
-layers {
-  name: "__mixed_5__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-  }
-  inputs {
-    input_layer_name: "__mixed_2__"
-    input_parameter_name: "___mixed_5__.w1"
-    proj_conf {
-      type: "scaling"
-      name: "___mixed_5__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_3__"
-  }
-  operator_confs {
-    type: "dot_mul"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 100
-    input_sizes: 100
-    output_size: 100
-    dotmul_scale: 1
-  }
-}
-layers {
-  name: "img"
-  type: "data"
-  size: 1024
-  active_type: ""
-}
-layers {
-  name: "filter"
-  type: "data"
-  size: 576
-  active_type: ""
-}
-layers {
-  name: "__mixed_6__"
-  type: "mixed"
-  size: 57600
-  active_type: ""
-  inputs {
-    input_layer_name: "img"
-  }
-  inputs {
-    input_layer_name: "img"
-    input_parameter_name: "___mixed_6__.w1"
-    proj_conf {
-      type: "conv"
-      name: "___mixed_6__.w1"
-      input_size: 1024
-      output_size: 57600
-      conv_conf {
-        filter_size: 3
-        channels: 1
-        stride: 1
-        padding: 0
-        groups: 1
-        filter_channels: 1
-        output_x: 30
-        img_size: 32
-        caffe_mode: true
-        filter_size_y: 3
-        padding_y: 0
-        stride_y: 1
-        output_y: 30
-        img_size_y: 32
-      }
-      num_filters: 64
-    }
-  }
-  inputs {
-    input_layer_name: "filter"
-  }
-  operator_confs {
-    type: "conv"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 1024
-    input_sizes: 576
-    output_size: 57600
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 0
-      groups: 1
-      filter_channels: 1
-      output_x: 30
-      img_size: 32
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 0
-      stride_y: 1
-      output_y: 30
-      img_size_y: 32
-    }
-    num_filters: 64
-  }
-}
-layers {
-  name: "__mixed_7__"
-  type: "mixed"
-  size: 254016
-  active_type: ""
-  inputs {
-    input_layer_name: "img"
-  }
-  inputs {
-    input_layer_name: "img"
-    input_parameter_name: "___mixed_7__.w1"
-    proj_conf {
-      type: "convt"
-      name: "___mixed_7__.w1"
-      input_size: 1024
-      output_size: 254016
-      conv_conf {
-        filter_size: 3
-        channels: 1
-        stride: 2
-        padding: 1
-        groups: 1
-        filter_channels: 64
-        output_x: 32
-        img_size: 63
-        caffe_mode: true
-        filter_size_y: 3
-        padding_y: 1
-        stride_y: 2
-        output_y: 32
-        img_size_y: 63
-      }
-      num_filters: 64
-    }
-  }
-  inputs {
-    input_layer_name: "filter"
-  }
-  operator_confs {
-    type: "convt"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 1024
-    input_sizes: 576
-    output_size: 254016
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 64
-      output_x: 32
-      img_size: 63
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 32
-      img_size_y: 63
-    }
-    num_filters: 64
-  }
-}
-layers {
-  name: "__mixed_8__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_4__"
-    input_parameter_name: "___mixed_8__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w0"
-      input_size: 300
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_5__"
-    input_parameter_name: "___mixed_8__.w1"
-    proj_conf {
-      type: "trans_fc"
-      name: "___mixed_8__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_6__"
-    input_parameter_name: "___mixed_8__.w2"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w2"
-      input_size: 57600
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_7__"
-    input_parameter_name: "___mixed_8__.w3"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w3"
-      input_size: 254016
-      output_size: 100
-    }
-  }
-  drop_rate: 0.5
-  error_clipping_threshold: 40.0
-}
-parameters {
-  name: "___embedding_0__.w0"
-  size: 25600
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 256
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_0__.w0"
-  size: 25600
-  initial_mean: 0.0
-  initial_std: 0.0625
-  dims: 256
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_1__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_3__.w0"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_4__.w0"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 2
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_5__.w1"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_6__.w1"
-  size: 576
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_7__.w1"
-  size: 576
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_8__.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.057735026919
-  dims: 300
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w1"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w2"
-  size: 5760000
-  initial_mean: 0.0
-  initial_std: 0.00416666666667
-  dims: 57600
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w3"
-  size: 25401600
-  initial_mean: 0.0
-  initial_std: 0.00198412698413
-  dims: 254016
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "test"
-input_layer_names: "img"
-input_layer_names: "filter"
-output_layer_names: "__mixed_8__"
-sub_models {
-  name: "root"
-  layer_names: "test"
-  layer_names: "__embedding_0__"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__mixed_2__"
-  layer_names: "__mixed_3__"
-  layer_names: "__mixed_4__"
-  layer_names: "__mixed_5__"
-  layer_names: "img"
-  layer_names: "filter"
-  layer_names: "__mixed_6__"
-  layer_names: "__mixed_7__"
-  layer_names: "__mixed_8__"
-  input_layer_names: "test"
-  input_layer_names: "img"
-  input_layer_names: "filter"
-  output_layer_names: "__mixed_8__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
deleted file mode 100644
index 3e8633b0798..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
+++ /dev/null
@@ -1,125 +0,0 @@
-type: "nn"
-layers {
-  name: "feature_a"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "feature_b"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "feature_a"
-    input_parameter_name: "fc_param"
-  }
-  bias_parameter_name: "bias_param"
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "feature_b"
-    input_parameter_name: "fc_param"
-  }
-  bias_parameter_name: "bias_param"
-}
-layers {
-  name: "__fc_layer_2__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_2__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "fc_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 200
-  dims: 200
-  initial_strategy: 1
-  initial_smart: false
-}
-parameters {
-  name: "bias_param"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 2000
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 200
-  dims: 10
-  initial_strategy: 1
-  initial_smart: false
-}
-input_layer_names: "feature_a"
-input_layer_names: "feature_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_2__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "feature_a"
-  layer_names: "feature_b"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__fc_layer_2__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "feature_a"
-  input_layer_names: "feature_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
deleted file mode 100644
index 7254deb3689..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
+++ /dev/null
@@ -1,289 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "data_a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "data_b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0___transform"
-  type: "mixed"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "data_a"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___simple_gru_0___transform.w0"
-      input_size: 100
-      output_size: 600
-    }
-  }
-}
-layers {
-  name: "__simple_gru_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  type: "scatter_agent"
-  size: 600
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-  type: "gru_step"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-    input_parameter_name: "gru_param"
-  }
-  inputs {
-    input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  }
-  bias_parameter_name: "gru_bias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__simple_gru_0__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1___transform"
-  type: "mixed"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "data_b"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___simple_gru_1___transform.w0"
-      input_size: 100
-      output_size: 600
-    }
-  }
-}
-layers {
-  name: "__simple_gru_1___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  type: "scatter_agent"
-  size: 600
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-  type: "gru_step"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-    input_parameter_name: "gru_param"
-  }
-  inputs {
-    input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  }
-  bias_parameter_name: "gru_bias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__simple_gru_1__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__simple_gru_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__simple_gru_1__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__last_seq_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__last_seq_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "mixed_param"
-  size: 60000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "gru_param"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "gru_bias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 2000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data_a"
-input_layer_names: "data_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "data_a"
-  layer_names: "data_b"
-  layer_names: "__simple_gru_0___transform"
-  layer_names: "__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__"
-  layer_names: "__simple_gru_1___transform"
-  layer_names: "__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "data_a"
-  input_layer_names: "data_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__simple_gru_0___transform"
-    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-    link_name: "__simple_gru_0__"
-  }
-}
-sub_models {
-  name: "__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  }
-  in_links {
-    layer_name: "__simple_gru_1___transform"
-    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  }
-  out_links {
-    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-    link_name: "__simple_gru_1__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
deleted file mode 100644
index 75cf2312032..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ /dev/null
@@ -1,385 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "data_a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "data_b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "data_a"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "data_b"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_0___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    input_parameter_name: "lstm_param"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_0___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  bias_parameter_name: "lstm_bias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_1__@__lstm_group_1___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_1___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-    input_parameter_name: "lstm_param"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_1___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  }
-  bias_parameter_name: "lstm_bias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_1__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_1__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__last_seq_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__last_seq_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "mixed_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "lstm_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "lstm_bias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 1000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data_a"
-input_layer_names: "data_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "data_a"
-  layer_names: "data_b"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__"
-  layer_names: "__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "data_a"
-  input_layer_names: "data_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__lstm_group_0___recurrent_group"
-  layer_names: "__mixed_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_0__"
-    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__"
-  }
-}
-sub_models {
-  name: "__lstm_group_1___recurrent_group"
-  layer_names: "__mixed_1__@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_1__"
-    link_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
deleted file mode 100644
index 0d51f70ee01..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ /dev/null
@@ -1,424 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__recurrent_layer_0__"
-  type: "recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___recurrent_layer_0__.w0"
-  }
-  bias_parameter_name: "___recurrent_layer_0__.wbias"
-  reversed: false
-}
-layers {
-  name: "__recurrent_layer_1__"
-  type: "recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___recurrent_layer_1__.w0"
-  }
-  bias_parameter_name: "___recurrent_layer_1__.wbias"
-  reversed: true
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 800
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-}
-layers {
-  name: "__lstmemory_0__"
-  type: "lstmemory"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-    input_parameter_name: "___lstmemory_0__.w0"
-  }
-  bias_parameter_name: "___lstmemory_0__.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__fc_layer_2__"
-  type: "fc"
-  size: 800
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_2__.w0"
-  }
-}
-layers {
-  name: "__lstmemory_1__"
-  type: "lstmemory"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_2__"
-    input_parameter_name: "___lstmemory_1__.w0"
-  }
-  bias_parameter_name: "___lstmemory_1__.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__fc_layer_3__"
-  type: "fc"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_3__.w0"
-  }
-}
-layers {
-  name: "__gru_0__"
-  type: "gated_recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_3__"
-    input_parameter_name: "___gru_0__.w0"
-  }
-  bias_parameter_name: "___gru_0__.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__fc_layer_4__"
-  type: "fc"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_4__.w0"
-  }
-}
-layers {
-  name: "__gru_1__"
-  type: "gated_recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_4__"
-    input_parameter_name: "___gru_1__.w0"
-  }
-  bias_parameter_name: "___gru_1__.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__recurrent_layer_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__recurrent_layer_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstmemory_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstmemory_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_2__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___recurrent_layer_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___recurrent_layer_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___recurrent_layer_1__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___recurrent_layer_1__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 800
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.wbias"
-  size: 1400
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1400
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_2__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 800
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_1__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_1__.wbias"
-  size: 1400
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1400
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_3__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.wbias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_4__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_1__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_1__.wbias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__first_seq_1__"
-output_layer_names: "__last_seq_2__"
-output_layer_names: "__first_seq_2__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__recurrent_layer_0__"
-  layer_names: "__recurrent_layer_1__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__lstmemory_0__"
-  layer_names: "__fc_layer_2__"
-  layer_names: "__lstmemory_1__"
-  layer_names: "__fc_layer_3__"
-  layer_names: "__gru_0__"
-  layer_names: "__fc_layer_4__"
-  layer_names: "__gru_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__first_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__first_seq_1__"
-  layer_names: "__last_seq_2__"
-  layer_names: "__first_seq_2__"
-  input_layer_names: "data"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__first_seq_1__"
-  output_layer_names: "__last_seq_2__"
-  output_layer_names: "__first_seq_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
deleted file mode 100644
index 9b69ae4a3b3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ /dev/null
@@ -1,93 +0,0 @@
-type: "nn"
-layers {
-  name: "data3D"
-  type: "data"
-  size: 360
-  active_type: ""
-  height: 6
-  width: 20
-  depth: 3
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 360
-  active_type: "relu"
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 1
-      img_size: 20
-      img_size_y: 6
-      img_size_z: 3
-    }
-  }
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 6
-  width: 20
-  depth: 3
-  epsilon: 1e-05
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 1
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data3D"
-output_layer_names: "__batch_norm_0__"
-sub_models {
-  name: "root"
-  layer_names: "data3D"
-  layer_names: "__batch_norm_0__"
-  input_layer_names: "data3D"
-  output_layer_names: "__batch_norm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
deleted file mode 100644
index 8a1399efad0..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
+++ /dev/null
@@ -1,155 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 120
-  active_type: ""
-}
-layers {
-  name: "__bidirectional_gru_0___fw_transform"
-  type: "mixed"
-  size: 120
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___bidirectional_gru_0___fw_transform.w0"
-    proj_conf {
-      type: "fc"
-      name: "___bidirectional_gru_0___fw_transform.w0"
-      input_size: 120
-      output_size: 120
-    }
-  }
-}
-layers {
-  name: "__bidirectional_gru_0___fw"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___fw_transform"
-    input_parameter_name: "___bidirectional_gru_0___fw.w0"
-  }
-  bias_parameter_name: "___bidirectional_gru_0___fw.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__bidirectional_gru_0___bw_transform"
-  type: "mixed"
-  size: 120
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___bidirectional_gru_0___bw_transform.w0"
-    proj_conf {
-      type: "fc"
-      name: "___bidirectional_gru_0___bw_transform.w0"
-      input_size: 120
-      output_size: 120
-    }
-  }
-}
-layers {
-  name: "__bidirectional_gru_0___bw"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___bw_transform"
-    input_parameter_name: "___bidirectional_gru_0___bw.w0"
-  }
-  bias_parameter_name: "___bidirectional_gru_0___bw.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__bidirectional_gru_0__"
-  type: "concat"
-  size: 80
-  active_type: ""
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___fw"
-  }
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___bw"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-parameters {
-  name: "___bidirectional_gru_0___fw_transform.w0"
-  size: 14400
-  initial_mean: 0.0
-  initial_std: 0.0912870929175
-  dims: 120
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___fw.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___fw.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___bidirectional_gru_0___bw_transform.w0"
-  size: 14400
-  initial_mean: 0.0
-  initial_std: 0.0912870929175
-  dims: 120
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___bw.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___bw.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__bidirectional_gru_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__bidirectional_gru_0___fw_transform"
-  layer_names: "__bidirectional_gru_0___fw"
-  layer_names: "__bidirectional_gru_0___bw_transform"
-  layer_names: "__bidirectional_gru_0___bw"
-  layer_names: "__bidirectional_gru_0__"
-  input_layer_names: "data"
-  output_layer_names: "__bidirectional_gru_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
deleted file mode 100644
index 25ec6323751..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ /dev/null
@@ -1,137 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2304
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 36864
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 48
-      img_size: 48
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 48
-}
-layers {
-  name: "__bilinear_interp_layer_0__"
-  type: "bilinear_interp"
-  size: 65536
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    bilinear_interp_conf {
-      image_conf {
-        channels: 16
-        img_size: 48
-        img_size_y: 48
-      }
-      out_size_x: 64
-      out_size_y: 64
-    }
-  }
-  height: 64
-  width: 64
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 16384
-  active_type: ""
-  inputs {
-    input_layer_name: "__bilinear_interp_layer_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 16
-      size_x: 2
-      stride: 2
-      output_x: 32
-      img_size: 64
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 32
-      img_size_y: 64
-      padding_y: 0
-    }
-  }
-  height: 32
-  width: 32
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 384
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__pool_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 6291456
-  initial_mean: 0.0
-  initial_std: 0.0078125
-  dims: 16384
-  dims: 384
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__bilinear_interp_layer_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__fc_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
deleted file mode 100644
index 4b9578a0c05..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
+++ /dev/null
@@ -1,31 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__clip_0__"
-  type: "clip"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    clip_conf {
-      min: -10
-      max: 10
-    }
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__clip_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__clip_0__"
-  input_layer_names: "input"
-  output_layer_names: "__clip_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
deleted file mode 100644
index 9fe2bc29d3c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
+++ /dev/null
@@ -1,132 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 36288
-  active_type: ""
-  height: 48
-  width: 42
-  depth: 6
-}
-layers {
-  name: "conv3d_1"
-  type: "conv3d"
-  size: 24192
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_conv3d_1.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 21
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 3
-      img_size_z: 6
-    }
-  }
-  bias_parameter_name: "_conv3d_1.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 24
-  width: 21
-  depth: 3
-}
-layers {
-  name: "conv3d_2"
-  type: "conv3d"
-  size: 24192
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_conv3d_2.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 21
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 3
-      img_size_z: 6
-    }
-  }
-  bias_parameter_name: "_conv3d_2.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 24
-  width: 21
-  depth: 3
-}
-parameters {
-  name: "_conv3d_1.w0"
-  size: 1296
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_1.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_2.w0"
-  size: 1296
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_2.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "conv3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "conv3d_1"
-  layer_names: "conv3d_2"
-  input_layer_names: "data"
-  output_layer_names: "conv3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
deleted file mode 100644
index 55ab464ddf8..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ /dev/null
@@ -1,375 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "labels"
-  type: "data"
-  size: 5000
-  active_type: ""
-}
-layers {
-  name: "probs"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "xe-label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 4
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__ctc_layer_0__"
-  type: "ctc"
-  size: 5001
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  norm_by_times: false
-}
-layers {
-  name: "__warp_ctc_layer_0__"
-  type: "warp_ctc"
-  size: 5001
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  norm_by_times: false
-  blank: 0
-}
-layers {
-  name: "crf_label"
-  type: "data"
-  size: 4
-  active_type: ""
-}
-layers {
-  name: "__crf_layer_0__"
-  type: "crf"
-  size: 4
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___crf_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "crf_label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "left"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "right"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__rank_cost_0__"
-  type: "rank-cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "left"
-  }
-  inputs {
-    input_layer_name: "right"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "list_feature"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "list_scores"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__lambda_cost_0__"
-  type: "lambda_cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "list_feature"
-  }
-  inputs {
-    input_layer_name: "list_scores"
-  }
-  NDCG_num: 5
-  max_sort_size: -1
-}
-layers {
-  name: "__cross_entropy_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__cross_entropy_with_selfnorm_0__"
-  type: "multi_class_cross_entropy_with_selfnorm"
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  softmax_selfnorm_alpha: 0.1
-  coeff: 1.0
-}
-layers {
-  name: "__huber_regression_cost_0__"
-  type: "huber_regression"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  coeff: 1.0
-  delta: 1.0
-}
-layers {
-  name: "huber_probs"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "huber_label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__huber_classification_cost_0__"
-  type: "huber_classification"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "huber_probs"
-  }
-  inputs {
-    input_layer_name: "huber_label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__multi_binary_label_cross_entropy_0__"
-  type: "multi_binary_label_cross_entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__sum_cost_0__"
-  type: "sum_cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__nce_layer_0__"
-  type: "nce"
-  size: 1
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___nce_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  bias_parameter_name: "___nce_layer_0__.wbias"
-  num_classes: 5000
-  num_neg_samples: 10
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 800
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 4
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 4
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___crf_layer_0__.w0"
-  size: 24
-  initial_mean: 0.0
-  initial_std: 0.408248290464
-  dims: 6
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.0141421356237
-  dims: 5000
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.wbias"
-  size: 5000
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 5000
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-input_layer_names: "labels"
-input_layer_names: "crf_label"
-input_layer_names: "left"
-input_layer_names: "right"
-input_layer_names: "label"
-input_layer_names: "list_feature"
-input_layer_names: "list_scores"
-input_layer_names: "probs"
-input_layer_names: "xe-label"
-input_layer_names: "huber_probs"
-input_layer_names: "huber_label"
-output_layer_names: "__ctc_layer_0__"
-output_layer_names: "__warp_ctc_layer_0__"
-output_layer_names: "__crf_layer_0__"
-output_layer_names: "__rank_cost_0__"
-output_layer_names: "__lambda_cost_0__"
-output_layer_names: "__cross_entropy_0__"
-output_layer_names: "__cross_entropy_with_selfnorm_0__"
-output_layer_names: "__huber_regression_cost_0__"
-output_layer_names: "__huber_classification_cost_0__"
-output_layer_names: "__multi_binary_label_cross_entropy_0__"
-output_layer_names: "__sum_cost_0__"
-output_layer_names: "__nce_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "labels"
-  layer_names: "probs"
-  layer_names: "xe-label"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__ctc_layer_0__"
-  layer_names: "__warp_ctc_layer_0__"
-  layer_names: "crf_label"
-  layer_names: "__crf_layer_0__"
-  layer_names: "left"
-  layer_names: "right"
-  layer_names: "label"
-  layer_names: "__rank_cost_0__"
-  layer_names: "list_feature"
-  layer_names: "list_scores"
-  layer_names: "__lambda_cost_0__"
-  layer_names: "__cross_entropy_0__"
-  layer_names: "__cross_entropy_with_selfnorm_0__"
-  layer_names: "__huber_regression_cost_0__"
-  layer_names: "huber_probs"
-  layer_names: "huber_label"
-  layer_names: "__huber_classification_cost_0__"
-  layer_names: "__multi_binary_label_cross_entropy_0__"
-  layer_names: "__sum_cost_0__"
-  layer_names: "__nce_layer_0__"
-  input_layer_names: "input"
-  input_layer_names: "labels"
-  input_layer_names: "crf_label"
-  input_layer_names: "left"
-  input_layer_names: "right"
-  input_layer_names: "label"
-  input_layer_names: "list_feature"
-  input_layer_names: "list_scores"
-  input_layer_names: "probs"
-  input_layer_names: "xe-label"
-  input_layer_names: "huber_probs"
-  input_layer_names: "huber_label"
-  output_layer_names: "__ctc_layer_0__"
-  output_layer_names: "__warp_ctc_layer_0__"
-  output_layer_names: "__crf_layer_0__"
-  output_layer_names: "__rank_cost_0__"
-  output_layer_names: "__lambda_cost_0__"
-  output_layer_names: "__cross_entropy_0__"
-  output_layer_names: "__cross_entropy_with_selfnorm_0__"
-  output_layer_names: "__huber_regression_cost_0__"
-  output_layer_names: "__huber_classification_cost_0__"
-  output_layer_names: "__multi_binary_label_cross_entropy_0__"
-  output_layer_names: "__sum_cost_0__"
-  output_layer_names: "__nce_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
deleted file mode 100644
index cec8a73db66..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ /dev/null
@@ -1,162 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "weight"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__square_error_cost_0__"
-  type: "square_error"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "multi_class_label"
-  type: "data"
-  size: 500
-  active_type: ""
-}
-layers {
-  name: "__nce_layer_0__"
-  type: "nce"
-  size: 1
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___nce_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "multi_class_label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  bias_parameter_name: "___nce_layer_0__.wbias"
-  num_classes: 500
-  num_neg_samples: 10
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 3000
-  initial_mean: 0.0
-  initial_std: 0.057735026919
-  dims: 300
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 10
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 10
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___nce_layer_0__.w0"
-  size: 5000
-  initial_mean: 0.0
-  initial_std: 0.04472135955
-  dims: 500
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.wbias"
-  size: 500
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 500
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-input_layer_names: "label"
-input_layer_names: "weight"
-input_layer_names: "multi_class_label"
-output_layer_names: "__cost_0__"
-output_layer_names: "__square_error_cost_0__"
-output_layer_names: "__nce_layer_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-  input_layers: "weight"
-}
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "label"
-  layer_names: "weight"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__cost_0__"
-  layer_names: "__square_error_cost_0__"
-  layer_names: "multi_class_label"
-  layer_names: "__nce_layer_0__"
-  input_layer_names: "input"
-  input_layer_names: "label"
-  input_layer_names: "weight"
-  input_layer_names: "multi_class_label"
-  output_layer_names: "__cost_0__"
-  output_layer_names: "__square_error_cost_0__"
-  output_layer_names: "__nce_layer_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
deleted file mode 100644
index a602569697e..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ /dev/null
@@ -1,207 +0,0 @@
-type: "nn"
-layers {
-  name: "sentence_states"
-  type: "data"
-  size: 32
-  active_type: ""
-}
-layers {
-  name: "sentence_scores"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__kmax_seq_score_layer_0__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  beam_size: 5
-}
-layers {
-  name: "__sub_nested_seq_layer_0__"
-  type: "sub_nested_seq"
-  size: 32
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_states"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_0__"
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__sub_nested_seq_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_1__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  beam_size: 5
-}
-layers {
-  name: "__seq_slice_layer_0__"
-  type: "seq_slice"
-  size: 32
-  active_type: ""
-  inputs {
-    input_layer_name: "__sub_nested_seq_layer_0__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_1__"
-  }
-  select_first: true
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__seq_slice_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-  bias_parameter_name: "___fc_layer_1__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_2__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-  }
-  beam_size: 5
-}
-layers {
-  name: "sentences_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "start_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "end_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__cross_entropy_over_beam_0__"
-  type: "cross_entropy_over_beam"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_0__"
-  }
-  inputs {
-    input_layer_name: "sentences_ids"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_1__"
-  }
-  inputs {
-    input_layer_name: "start_ids"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_2__"
-  }
-  inputs {
-    input_layer_name: "end_ids"
-  }
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_1__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "sentence_scores"
-input_layer_names: "sentences_ids"
-input_layer_names: "sentence_states"
-input_layer_names: "start_ids"
-input_layer_names: "end_ids"
-output_layer_names: "__cross_entropy_over_beam_0__"
-sub_models {
-  name: "root"
-  layer_names: "sentence_states"
-  layer_names: "sentence_scores"
-  layer_names: "__kmax_seq_score_layer_0__"
-  layer_names: "__sub_nested_seq_layer_0__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__kmax_seq_score_layer_1__"
-  layer_names: "__seq_slice_layer_0__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__kmax_seq_score_layer_2__"
-  layer_names: "sentences_ids"
-  layer_names: "start_ids"
-  layer_names: "end_ids"
-  layer_names: "__cross_entropy_over_beam_0__"
-  input_layer_names: "sentence_scores"
-  input_layer_names: "sentences_ids"
-  input_layer_names: "sentence_states"
-  input_layer_names: "start_ids"
-  input_layer_names: "end_ids"
-  output_layer_names: "__cross_entropy_over_beam_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
deleted file mode 100644
index 7bf409731cb..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
+++ /dev/null
@@ -1,132 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 36288
-  active_type: ""
-  height: 48
-  width: 42
-  depth: 6
-}
-layers {
-  name: "deconv3d_1"
-  type: "deconv3d"
-  size: 1387760
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_deconv3d_1.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 16
-      output_x: 42
-      img_size: 83
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 48
-      img_size_y: 95
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 6
-      img_size_z: 11
-    }
-  }
-  bias_parameter_name: "_deconv3d_1.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 95
-  width: 83
-  depth: 11
-}
-layers {
-  name: "deconv3d_2"
-  type: "deconv3d"
-  size: 1387760
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_deconv3d_2.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 16
-      output_x: 42
-      img_size: 83
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 48
-      img_size_y: 95
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 6
-      img_size_z: 11
-    }
-  }
-  bias_parameter_name: "_deconv3d_2.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 95
-  width: 83
-  depth: 11
-}
-parameters {
-  name: "_deconv3d_1.w0"
-  size: 6912
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_1.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_2.w0"
-  size: 6912
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_2.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "deconv3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "deconv3d_1"
-  layer_names: "deconv3d_2"
-  input_layer_names: "data"
-  output_layer_names: "deconv3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
deleted file mode 100644
index 6690f9852a3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
+++ /dev/null
@@ -1,66 +0,0 @@
-type: "nn"
-layers {
-  name: "input_loc"
-  type: "data"
-  size: 16
-  active_type: ""
-  height: 16
-  width: 1
-}
-layers {
-  name: "input_conf"
-  type: "data"
-  size: 8
-  active_type: ""
-  height: 1
-  width: 8
-}
-layers {
-  name: "priorbox"
-  type: "data"
-  size: 32
-  active_type: ""
-  height: 4
-  width: 8
-}
-layers {
-  name: "test_detection_output"
-  type: "detection_output"
-  size: 1400
-  active_type: ""
-  inputs {
-    input_layer_name: "priorbox"
-    detection_output_conf {
-      num_classes: 21
-      nms_threshold: 0.45
-      nms_top_k: 400
-      background_id: 0
-      input_num: 1
-      keep_top_k: 200
-      confidence_threshold: 0.01
-    }
-  }
-  inputs {
-    input_layer_name: "input_loc"
-  }
-  inputs {
-    input_layer_name: "input_conf"
-  }
-}
-input_layer_names: "priorbox"
-input_layer_names: "input_loc"
-input_layer_names: "input_conf"
-output_layer_names: "test_detection_output"
-sub_models {
-  name: "root"
-  layer_names: "input_loc"
-  layer_names: "input_conf"
-  layer_names: "priorbox"
-  layer_names: "test_detection_output"
-  input_layer_names: "priorbox"
-  input_layer_names: "input_loc"
-  input_layer_names: "input_conf"
-  output_layer_names: "test_detection_output"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
deleted file mode 100644
index f1530c382c3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
+++ /dev/null
@@ -1,38 +0,0 @@
-type: "nn"
-layers {
-  name: "vector1"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "vector2"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__dot_prod_layer_0__"
-  type: "dot_prod"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "vector1"
-  }
-  inputs {
-    input_layer_name: "vector2"
-  }
-}
-input_layer_names: "vector1"
-input_layer_names: "vector2"
-output_layer_names: "__dot_prod_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "vector1"
-  layer_names: "vector2"
-  layer_names: "__dot_prod_layer_0__"
-  input_layer_names: "vector1"
-  input_layer_names: "vector2"
-  output_layer_names: "__dot_prod_layer_0__"
-  is_recurrent_layer_group: false
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
deleted file mode 100644
index f4b36052264..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
+++ /dev/null
@@ -1,56 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data_seq"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__expand_layer_0__"
-  type: "expand"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data_seq"
-  }
-  trans_type: "seq"
-}
-layers {
-  name: "__expand_layer_1__"
-  type: "expand"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data_seq"
-  }
-  trans_type: "non-seq"
-}
-input_layer_names: "data"
-input_layer_names: "data_seq"
-output_layer_names: "__expand_layer_0__"
-output_layer_names: "__expand_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "data_seq"
-  layer_names: "__expand_layer_0__"
-  layer_names: "__expand_layer_1__"
-  input_layer_names: "data"
-  input_layer_names: "data_seq"
-  output_layer_names: "__expand_layer_0__"
-  output_layer_names: "__expand_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
deleted file mode 100644
index 4f3002b1994..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
+++ /dev/null
@@ -1,39 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 1024
-  active_type: ""
-}
-layers {
-  name: "__factorization_machine_0__"
-  type: "factorization_machine"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___factorization_machine_0__.w0"
-  }
-  factor_size: 10
-}
-parameters {
-  name: "___factorization_machine_0__.w0"
-  size: 10240
-  initial_mean: 0.0
-  initial_std: 0.03125
-  dims: 1024
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__factorization_machine_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__factorization_machine_0__"
-  input_layer_names: "data"
-  output_layer_names: "__factorization_machine_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
deleted file mode 100644
index 8151898832d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
+++ /dev/null
@@ -1,98 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__trans_layer_0__"
-  type: "trans"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__trans_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-layers {
-  name: "mask"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__selective_fc_layer_0__"
-  type: "selective_fc"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___selective_fc_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "mask"
-  }
-  bias_parameter_name: "___selective_fc_layer_0__.wbias"
-  selective_fc_pass_generation: false
-  has_selected_colums: true
-  selective_fc_full_mul_ratio: 0.02
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___selective_fc_layer_0__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-  is_sparse: false
-}
-parameters {
-  name: "___selective_fc_layer_0__.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "mask"
-output_layer_names: "__fc_layer_0__"
-output_layer_names: "__selective_fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__trans_layer_0__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "mask"
-  layer_names: "__selective_fc_layer_0__"
-  input_layer_names: "data"
-  input_layer_names: "mask"
-  output_layer_names: "__fc_layer_0__"
-  output_layer_names: "__selective_fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
deleted file mode 100644
index f1e4d894a5f..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+++ /dev/null
@@ -1,106 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 256
-  active_type: ""
-}
-layers {
-  name: "__gated_unit_layer_0___input_proj"
-  type: "fc"
-  size: 512
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___gated_unit_layer_0___input_proj.w0"
-  }
-  bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias"
-  error_clipping_threshold: 100.0
-}
-layers {
-  name: "__gated_unit_layer_0___gate"
-  type: "fc"
-  size: 512
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___gated_unit_layer_0___gate.w0"
-  }
-  bias_parameter_name: "___gated_unit_layer_0___gate.wbias"
-  error_clipping_threshold: 100.0
-}
-layers {
-  name: "__gated_unit_layer_0___gated_act"
-  type: "mixed"
-  size: 512
-  active_type: ""
-  inputs {
-    input_layer_name: "__gated_unit_layer_0___input_proj"
-  }
-  inputs {
-    input_layer_name: "__gated_unit_layer_0___gate"
-  }
-  error_clipping_threshold: 100.0
-  operator_confs {
-    type: "dot_mul"
-    input_indices: 0
-    input_indices: 1
-    input_sizes: 512
-    input_sizes: 512
-    output_size: 512
-    dotmul_scale: 1
-  }
-}
-parameters {
-  name: "___gated_unit_layer_0___input_proj.w0"
-  size: 131072
-  initial_mean: 0.0
-  initial_std: 0.0001
-  dims: 256
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___input_proj.wbias"
-  size: 512
-  initial_mean: 0.0
-  initial_std: 1
-  dims: 1
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___gate.w0"
-  size: 131072
-  initial_mean: 0.0
-  initial_std: 0.0001
-  dims: 256
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___gate.wbias"
-  size: 512
-  initial_mean: 0.0
-  initial_std: 1
-  dims: 1
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "__gated_unit_layer_0___gated_act"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__gated_unit_layer_0___input_proj"
-  layer_names: "__gated_unit_layer_0___gate"
-  layer_names: "__gated_unit_layer_0___gated_act"
-  input_layer_names: "input"
-  output_layer_names: "__gated_unit_layer_0___gated_act"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
deleted file mode 100644
index 2c19b2fd120..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 120
-  active_type: ""
-}
-layers {
-  name: "__gru_0__"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___gru_0__.w0"
-  }
-  bias_parameter_name: "___gru_0__.wbias"
-  reversed: true
-  active_gate_type: "tanh"
-}
-parameters {
-  name: "___gru_0__.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__gru_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__gru_0__"
-  input_layer_names: "data"
-  output_layer_names: "__gru_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
deleted file mode 100644
index e81fcb13c4c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
+++ /dev/null
@@ -1,62 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__hsigmoid_0__"
-  type: "hsigmoid"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___hsigmoid_0__.w0"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  bias_parameter_name: "___hsigmoid_0__.wbias"
-  num_classes: 10
-}
-parameters {
-  name: "___hsigmoid_0__.w0"
-  size: 900
-  initial_mean: 0.0
-  initial_std: 0.333333333333
-  dims: 9
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___hsigmoid_0__.wbias"
-  size: 9
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 9
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "label"
-output_layer_names: "__hsigmoid_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "label"
-  layer_names: "__hsigmoid_0__"
-  input_layer_names: "data"
-  input_layer_names: "label"
-  output_layer_names: "__hsigmoid_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
deleted file mode 100644
index f93d368c868..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ /dev/null
@@ -1,59 +0,0 @@
-type: "nn"
-layers {
-  name: "input_seq"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 1
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "input_seq"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_0__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  beam_size: 5
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 128
-  initial_mean: 0.0
-  initial_std: 0.0883883476483
-  dims: 128
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input_seq"
-output_layer_names: "__kmax_seq_score_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input_seq"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__kmax_seq_score_layer_0__"
-  input_layer_names: "input_seq"
-  output_layer_names: "__kmax_seq_score_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
deleted file mode 100644
index 9ba33689edc..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
+++ /dev/null
@@ -1,39 +0,0 @@
-type: "nn"
-layers {
-  name: "x"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "y"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__l2_distance_layer_0__"
-  type: "l2_distance"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "x"
-  }
-  inputs {
-    input_layer_name: "y"
-  }
-}
-input_layer_names: "x"
-input_layer_names: "y"
-output_layer_names: "__l2_distance_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "x"
-  layer_names: "y"
-  layer_names: "__l2_distance_layer_0__"
-  input_layer_names: "x"
-  input_layer_names: "y"
-  output_layer_names: "__l2_distance_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
deleted file mode 100644
index 76a4afab82c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
+++ /dev/null
@@ -1,53 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__lstmemory_0__"
-  type: "lstmemory"
-  size: 32
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___lstmemory_0__.w0"
-  }
-  bias_parameter_name: "___lstmemory_0__.wbias"
-  reversed: true
-  active_gate_type: "tanh"
-  active_state_type: "tanh"
-}
-parameters {
-  name: "___lstmemory_0__.w0"
-  size: 4096
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 32
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.wbias"
-  size: 224
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 224
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__lstmemory_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__lstmemory_0__"
-  input_layer_names: "data"
-  output_layer_names: "__lstmemory_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
deleted file mode 100644
index 39dc4871469..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ /dev/null
@@ -1,233 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2304
-  active_type: ""
-  height: 48
-  width: 48
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 36864
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 48
-      img_size: 48
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 48
-}
-layers {
-  name: "__maxout_layer_0__"
-  type: "maxout"
-  size: 18432
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    maxout_conf {
-      image_conf {
-        channels: 16
-        img_size: 48
-        img_size_y: 48
-      }
-      groups: 2
-    }
-  }
-  height: 48
-  width: 48
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 4608
-  active_type: ""
-  inputs {
-    input_layer_name: "__maxout_layer_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 8
-      size_x: 2
-      stride: 2
-      output_x: 24
-      img_size: 48
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      padding_y: 0
-    }
-  }
-  height: 24
-  width: 24
-}
-layers {
-  name: "__conv_1__"
-  type: "exconv"
-  size: 73728
-  active_type: ""
-  inputs {
-    input_layer_name: "__pool_0__"
-    input_parameter_name: "___conv_1__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 8
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 8
-      output_x: 24
-      img_size: 24
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 24
-      img_size_y: 24
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_1__.wbias"
-  num_filters: 128
-  shared_biases: true
-  height: 24
-  width: 24
-}
-layers {
-  name: "__maxout_layer_1__"
-  type: "maxout"
-  size: 18432
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_1__"
-    maxout_conf {
-      image_conf {
-        channels: 128
-        img_size: 24
-        img_size_y: 24
-      }
-      groups: 4
-    }
-  }
-  height: 24
-  width: 24
-}
-layers {
-  name: "__block_expand_layer_0__"
-  type: "blockexpand"
-  size: 192
-  active_type: ""
-  inputs {
-    input_layer_name: "__maxout_layer_1__"
-    block_expand_conf {
-      channels: 32
-      stride_x: 1
-      stride_y: 1
-      padding_x: 0
-      padding_y: 0
-      block_x: 1
-      block_y: 6
-      output_x: 0
-      output_y: 0
-      img_size_x: 0
-      img_size_y: 0
-    }
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 384
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__block_expand_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_1__.w0"
-  size: 9216
-  initial_mean: 0.0
-  initial_std: 0.166666666667
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_1__.wbias"
-  size: 128
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 128
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 73728
-  initial_mean: 0.0
-  initial_std: 0.0721687836487
-  dims: 192
-  dims: 384
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__maxout_layer_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__conv_1__"
-  layer_names: "__maxout_layer_1__"
-  layer_names: "__block_expand_layer_0__"
-  layer_names: "__fc_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
deleted file mode 100644
index 0ba84dcc6db..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
+++ /dev/null
@@ -1,79 +0,0 @@
-type: "nn"
-layers {
-  name: "input_loc"
-  type: "data"
-  size: 16
-  active_type: ""
-  height: 16
-  width: 1
-}
-layers {
-  name: "input_conf"
-  type: "data"
-  size: 8
-  active_type: ""
-  height: 1
-  width: 8
-}
-layers {
-  name: "priorbox"
-  type: "data"
-  size: 32
-  active_type: ""
-  height: 4
-  width: 8
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 24
-  active_type: ""
-  height: 4
-  width: 6
-}
-layers {
-  name: "test_multibox_loss"
-  type: "multibox_loss"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "priorbox"
-    multibox_loss_conf {
-      num_classes: 21
-      overlap_threshold: 0.5
-      neg_pos_ratio: 3.0
-      neg_overlap: 0.5
-      background_id: 0
-      input_num: 1
-    }
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "input_loc"
-  }
-  inputs {
-    input_layer_name: "input_conf"
-  }
-}
-input_layer_names: "priorbox"
-input_layer_names: "label"
-input_layer_names: "input_loc"
-input_layer_names: "input_conf"
-output_layer_names: "test_multibox_loss"
-sub_models {
-  name: "root"
-  layer_names: "input_loc"
-  layer_names: "input_conf"
-  layer_names: "priorbox"
-  layer_names: "label"
-  layer_names: "test_multibox_loss"
-  input_layer_names: "priorbox"
-  input_layer_names: "label"
-  input_layer_names: "input_loc"
-  input_layer_names: "input_conf"
-  output_layer_names: "test_multibox_loss"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
deleted file mode 100644
index 379842ba8d3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
+++ /dev/null
@@ -1,63 +0,0 @@
-type: "nn"
-layers {
-  name: "index"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "data1"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data2"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data3"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__multiplex_layer_0__"
-  type: "multiplex"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "index"
-  }
-  inputs {
-    input_layer_name: "data1"
-  }
-  inputs {
-    input_layer_name: "data2"
-  }
-  inputs {
-    input_layer_name: "data3"
-  }
-}
-input_layer_names: "index"
-input_layer_names: "data1"
-input_layer_names: "data2"
-input_layer_names: "data3"
-output_layer_names: "__multiplex_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "index"
-  layer_names: "data1"
-  layer_names: "data2"
-  layer_names: "data3"
-  layer_names: "__multiplex_layer_0__"
-  input_layer_names: "index"
-  input_layer_names: "data1"
-  input_layer_names: "data2"
-  input_layer_names: "data3"
-  output_layer_names: "__multiplex_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
deleted file mode 100644
index c1bfdf1b19c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ /dev/null
@@ -1,225 +0,0 @@
-type: "nn"
-layers {
-  name: "w"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "c"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "d"
-  type: "data"
-  size: 31
-  active_type: ""
-}
-layers {
-  name: "__interpolation_layer_0__"
-  type: "interpolation"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-}
-layers {
-  name: "__power_layer_0__"
-  type: "power"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__scaling_layer_0__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__cos_sim_0__"
-  type: "cos"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  cos_scale: 1
-}
-layers {
-  name: "__cos_sim_1__"
-  type: "cos_vm"
-  size: 2
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "c"
-  }
-  cos_scale: 1
-}
-layers {
-  name: "__sum_to_one_norm_layer_0__"
-  type: "sum_to_one_norm"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__conv_shift_layer_0__"
-  type: "conv_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "d"
-  }
-}
-layers {
-  name: "__tensor_layer_0__"
-  type: "tensor"
-  size: 1000
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-    input_parameter_name: "___tensor_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  bias_parameter_name: "___tensor_layer_0__.wbias"
-}
-layers {
-  name: "__slope_intercept_layer_0__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  slope: 0.7
-  intercept: 0.9
-}
-layers {
-  name: "__linear_comb_layer_0__"
-  type: "convex_comb"
-  size: 2
-  active_type: ""
-  inputs {
-    input_layer_name: "b"
-  }
-  inputs {
-    input_layer_name: "c"
-  }
-}
-parameters {
-  name: "___tensor_layer_0__.w0"
-  size: 10000000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  dims: 1000
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___tensor_layer_0__.wbias"
-  size: 1000
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1000
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "w"
-input_layer_names: "a"
-input_layer_names: "b"
-input_layer_names: "c"
-input_layer_names: "d"
-output_layer_names: "__interpolation_layer_0__"
-output_layer_names: "__power_layer_0__"
-output_layer_names: "__scaling_layer_0__"
-output_layer_names: "__cos_sim_0__"
-output_layer_names: "__cos_sim_1__"
-output_layer_names: "__sum_to_one_norm_layer_0__"
-output_layer_names: "__conv_shift_layer_0__"
-output_layer_names: "__tensor_layer_0__"
-output_layer_names: "__slope_intercept_layer_0__"
-output_layer_names: "__linear_comb_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "w"
-  layer_names: "a"
-  layer_names: "b"
-  layer_names: "c"
-  layer_names: "d"
-  layer_names: "__interpolation_layer_0__"
-  layer_names: "__power_layer_0__"
-  layer_names: "__scaling_layer_0__"
-  layer_names: "__cos_sim_0__"
-  layer_names: "__cos_sim_1__"
-  layer_names: "__sum_to_one_norm_layer_0__"
-  layer_names: "__conv_shift_layer_0__"
-  layer_names: "__tensor_layer_0__"
-  layer_names: "__slope_intercept_layer_0__"
-  layer_names: "__linear_comb_layer_0__"
-  input_layer_names: "w"
-  input_layer_names: "a"
-  input_layer_names: "b"
-  input_layer_names: "c"
-  input_layer_names: "d"
-  output_layer_names: "__interpolation_layer_0__"
-  output_layer_names: "__power_layer_0__"
-  output_layer_names: "__scaling_layer_0__"
-  output_layer_names: "__cos_sim_0__"
-  output_layer_names: "__cos_sim_1__"
-  output_layer_names: "__sum_to_one_norm_layer_0__"
-  output_layer_names: "__conv_shift_layer_0__"
-  output_layer_names: "__tensor_layer_0__"
-  output_layer_names: "__slope_intercept_layer_0__"
-  output_layer_names: "__linear_comb_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
deleted file mode 100644
index d5d6d31a17b..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ /dev/null
@@ -1,122 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2016
-  active_type: ""
-  height: 48
-  width: 42
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 32256
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 42
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 42
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 8064
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 16
-      size_x: 2
-      stride: 2
-      output_x: 21
-      img_size: 42
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      padding_y: 0
-    }
-  }
-  height: 24
-  width: 21
-}
-layers {
-  name: "__pad_0__"
-  type: "pad"
-  size: 14175
-  active_type: ""
-  inputs {
-    input_layer_name: "__pool_0__"
-    pad_conf {
-      image_conf {
-        channels: 16
-        img_size: 21
-        img_size_y: 24
-      }
-      pad_c: 2
-      pad_c: 3
-      pad_h: 1
-      pad_h: 2
-      pad_w: 3
-      pad_w: 1
-    }
-  }
-  height: 27
-  width: 25
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__pad_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__pad_0__"
-  input_layer_names: "data"
-  output_layer_names: "__pad_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
deleted file mode 100644
index 8eb98593f6f..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
+++ /dev/null
@@ -1,123 +0,0 @@
-type: "nn"
-layers {
-  name: "data_2d"
-  type: "data"
-  size: 6000
-  active_type: ""
-  height: 20
-  width: 10
-}
-layers {
-  name: "pool___2d"
-  type: "pool"
-  size: 840
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2d"
-    pool_conf {
-      pool_type: "avg-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-    }
-  }
-  height: 7
-  width: 4
-}
-layers {
-  name: "data_3d_1"
-  type: "data"
-  size: 60000
-  active_type: ""
-  height: 20
-  width: 10
-  depth: 10
-}
-layers {
-  name: "pool_3d_1"
-  type: "pool3d"
-  size: 3360
-  active_type: ""
-  inputs {
-    input_layer_name: "data_3d_1"
-    pool_conf {
-      pool_type: "avg-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-      size_z: 5
-      stride_z: 3
-      output_z: 4
-      img_size_z: 10
-      padding_z: 1
-    }
-  }
-  height: 7
-  width: 4
-  depth: 4
-}
-layers {
-  name: "pool_3d_2"
-  type: "pool3d"
-  size: 3360
-  active_type: ""
-  inputs {
-    input_layer_name: "data_3d_1"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-      size_z: 5
-      stride_z: 3
-      output_z: 4
-      img_size_z: 10
-      padding_z: 1
-    }
-  }
-  height: 7
-  width: 4
-  depth: 4
-}
-input_layer_names: "data_2d"
-output_layer_names: "pool___2d"
-output_layer_names: "pool_3d_1"
-output_layer_names: "pool_3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data_2d"
-  layer_names: "pool___2d"
-  layer_names: "data_3d_1"
-  layer_names: "pool_3d_1"
-  layer_names: "pool_3d_2"
-  input_layer_names: "data_2d"
-  output_layer_names: "pool___2d"
-  output_layer_names: "pool_3d_1"
-  output_layer_names: "pool_3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
deleted file mode 100644
index 63fb38c6508..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ /dev/null
@@ -1,144 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-  height: 10
-  width: 10
-}
-layers {
-  name: "__prelu_layer_0__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_0__.w0"
-  }
-  partial_sum: 1
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_1__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_1__.w0"
-  }
-  partial_sum: 1
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_2__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_2__.w0"
-  }
-  partial_sum: 5
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_3__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_3__.w0"
-  }
-  partial_sum: 300
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_4__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_4__.w0"
-  }
-  partial_sum: 100
-  height: 10
-  width: 10
-  depth: 1
-}
-parameters {
-  name: "___prelu_layer_0__.w0"
-  size: 300
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_1__.w0"
-  size: 300
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_2__.w0"
-  size: 60
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 60
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_3__.w0"
-  size: 1
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_4__.w0"
-  size: 3
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 3
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "__prelu_layer_4__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__prelu_layer_0__"
-  layer_names: "__prelu_layer_1__"
-  layer_names: "__prelu_layer_2__"
-  layer_names: "__prelu_layer_3__"
-  layer_names: "__prelu_layer_4__"
-  input_layer_names: "input"
-  output_layer_names: "__prelu_layer_4__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
deleted file mode 100644
index f4cc492dfb9..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__print_0__"
-  type: "print"
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  user_arg: "layer=input %s"
-}
-input_layer_names: "input"
-output_layer_names: "input"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__print_0__"
-  input_layer_names: "input"
-  output_layer_names: "input"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
deleted file mode 100644
index 046037936a6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+++ /dev/null
@@ -1,593 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__addto_0__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_1__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_0__"
-  }
-  inputs {
-    input_layer_name: "__addto_0__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_2__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_1__"
-  }
-  inputs {
-    input_layer_name: "__addto_1__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_3__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_2__"
-  }
-  inputs {
-    input_layer_name: "__addto_2__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_4__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_3__"
-  }
-  inputs {
-    input_layer_name: "__addto_3__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_5__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_4__"
-  }
-  inputs {
-    input_layer_name: "__addto_4__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_6__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_5__"
-  }
-  inputs {
-    input_layer_name: "__addto_5__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_7__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_6__"
-  }
-  inputs {
-    input_layer_name: "__addto_6__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_8__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_7__"
-  }
-  inputs {
-    input_layer_name: "__addto_7__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_9__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_8__"
-  }
-  inputs {
-    input_layer_name: "__addto_8__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_10__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_9__"
-  }
-  inputs {
-    input_layer_name: "__addto_9__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_11__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_10__"
-  }
-  inputs {
-    input_layer_name: "__addto_10__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_12__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_11__"
-  }
-  inputs {
-    input_layer_name: "__addto_11__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_13__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_12__"
-  }
-  inputs {
-    input_layer_name: "__addto_12__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_14__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_13__"
-  }
-  inputs {
-    input_layer_name: "__addto_13__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_15__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_14__"
-  }
-  inputs {
-    input_layer_name: "__addto_14__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_16__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_15__"
-  }
-  inputs {
-    input_layer_name: "__addto_15__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_17__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_16__"
-  }
-  inputs {
-    input_layer_name: "__addto_16__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_18__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_17__"
-  }
-  inputs {
-    input_layer_name: "__addto_17__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_19__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_18__"
-  }
-  inputs {
-    input_layer_name: "__addto_18__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_20__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_19__"
-  }
-  inputs {
-    input_layer_name: "__addto_19__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_21__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_20__"
-  }
-  inputs {
-    input_layer_name: "__addto_20__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_22__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_21__"
-  }
-  inputs {
-    input_layer_name: "__addto_21__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_23__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_22__"
-  }
-  inputs {
-    input_layer_name: "__addto_22__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_24__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_23__"
-  }
-  inputs {
-    input_layer_name: "__addto_23__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_25__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_24__"
-  }
-  inputs {
-    input_layer_name: "__addto_24__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_26__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_25__"
-  }
-  inputs {
-    input_layer_name: "__addto_25__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_27__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_26__"
-  }
-  inputs {
-    input_layer_name: "__addto_26__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_28__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_27__"
-  }
-  inputs {
-    input_layer_name: "__addto_27__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_29__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_28__"
-  }
-  inputs {
-    input_layer_name: "__addto_28__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_30__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_29__"
-  }
-  inputs {
-    input_layer_name: "__addto_29__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_31__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_30__"
-  }
-  inputs {
-    input_layer_name: "__addto_30__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 32
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__addto_31__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-  bias_parameter_name: "___fc_layer_1__.wbias"
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 3200
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 32
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 32
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 320
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_1__.wbias"
-  size: 10
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 10
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__addto_0__"
-  layer_names: "__addto_1__"
-  layer_names: "__addto_2__"
-  layer_names: "__addto_3__"
-  layer_names: "__addto_4__"
-  layer_names: "__addto_5__"
-  layer_names: "__addto_6__"
-  layer_names: "__addto_7__"
-  layer_names: "__addto_8__"
-  layer_names: "__addto_9__"
-  layer_names: "__addto_10__"
-  layer_names: "__addto_11__"
-  layer_names: "__addto_12__"
-  layer_names: "__addto_13__"
-  layer_names: "__addto_14__"
-  layer_names: "__addto_15__"
-  layer_names: "__addto_16__"
-  layer_names: "__addto_17__"
-  layer_names: "__addto_18__"
-  layer_names: "__addto_19__"
-  layer_names: "__addto_20__"
-  layer_names: "__addto_21__"
-  layer_names: "__addto_22__"
-  layer_names: "__addto_23__"
-  layer_names: "__addto_24__"
-  layer_names: "__addto_25__"
-  layer_names: "__addto_26__"
-  layer_names: "__addto_27__"
-  layer_names: "__addto_28__"
-  layer_names: "__addto_29__"
-  layer_names: "__addto_30__"
-  layer_names: "__addto_31__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__fc_layer_1__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
deleted file mode 100644
index e012386ff95..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
+++ /dev/null
@@ -1,42 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__repeat_layer_0__"
-  type: "featmap_expand"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  num_filters: 10
-}
-layers {
-  name: "__repeat_layer_1__"
-  type: "featmap_expand"
-  size: 300
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "data"
-  }
-  num_filters: 10
-  user_arg: "as_col_vec"
-}
-input_layer_names: "data"
-output_layer_names: "__repeat_layer_0__"
-output_layer_names: "__repeat_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__repeat_layer_0__"
-  layer_names: "__repeat_layer_1__"
-  input_layer_names: "data"
-  output_layer_names: "__repeat_layer_0__"
-  output_layer_names: "__repeat_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
deleted file mode 100644
index 9399252b23d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__resize_0__"
-  type: "resize"
-  size: 150
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__resize_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__resize_0__"
-  input_layer_names: "input"
-  output_layer_names: "__resize_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
deleted file mode 100644
index 711785be37d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ /dev/null
@@ -1,738 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "seq_input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "sub_seq_input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "seq_input"
-    input_parameter_name: "___mixed_0__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "seq_input"
-    input_parameter_name: "___mixed_1__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 300
-    }
-  }
-}
-layers {
-  name: "__recurrent_group_0__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_0__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_forward+delay1@__recurrent_group_0__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_forward@__recurrent_group_0__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_0__"
-    input_parameter_name: "_rnn_forward@__recurrent_group_0__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_forward+delay1@__recurrent_group_0__"
-    input_parameter_name: "_rnn_forward@__recurrent_group_0__.w1"
-  }
-  bias_parameter_name: "_rnn_forward@__recurrent_group_0__.wbias"
-}
-layers {
-  name: "rnn_forward"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_forward"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_1__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_1__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_back+delay1@__recurrent_group_1__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_back@__recurrent_group_1__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_1__"
-    input_parameter_name: "_rnn_back@__recurrent_group_1__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_back+delay1@__recurrent_group_1__"
-    input_parameter_name: "_rnn_back@__recurrent_group_1__.w1"
-  }
-  bias_parameter_name: "_rnn_back@__recurrent_group_1__.wbias"
-}
-layers {
-  name: "rnn_back"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_back"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_2__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "sub_seq_input@__recurrent_group_2__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_subseq_forward@__recurrent_group_2__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "sub_seq_input@__recurrent_group_2__"
-    input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-    input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w1"
-  }
-  bias_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.wbias"
-}
-layers {
-  name: "rnn_subseq_forward"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_subseq_forward"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__lstm_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_0___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    input_parameter_name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_0___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__gru_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_1__@__gru_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__gru_group_0__@__gru_group_0___recurrent_group"
-  type: "gru_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
-  }
-  inputs {
-    input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  }
-  bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__gru_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_3__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_3__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_3__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__memory_6__@__recurrent_group_3__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__@__recurrent_group_3__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_3__"
-    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0"
-  }
-  inputs {
-    input_layer_name: "__memory_6__@__recurrent_group_3__"
-    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1"
-  }
-  bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias"
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_4__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-parameters {
-  name: "___mixed_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_1__.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 300
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 300
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "seq_input"
-input_layer_names: "sub_seq_input"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__last_seq_2__"
-output_layer_names: "__last_seq_3__"
-output_layer_names: "__last_seq_4__"
-sub_models {
-  name: "root"
-  layer_names: "seq_input"
-  layer_names: "sub_seq_input"
-  layer_names: "label"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__recurrent_group_0__"
-  layer_names: "rnn_forward"
-  layer_names: "__last_seq_0__"
-  layer_names: "__recurrent_group_1__"
-  layer_names: "rnn_back"
-  layer_names: "__first_seq_0__"
-  layer_names: "__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward"
-  layer_names: "__last_seq_1__"
-  layer_names: "__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__"
-  layer_names: "__last_seq_2__"
-  layer_names: "__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__"
-  layer_names: "__last_seq_3__"
-  layer_names: "__recurrent_group_3__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__last_seq_4__"
-  input_layer_names: "seq_input"
-  input_layer_names: "sub_seq_input"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__last_seq_2__"
-  output_layer_names: "__last_seq_3__"
-  output_layer_names: "__last_seq_4__"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__recurrent_group_0__"
-  layer_names: "seq_input@__recurrent_group_0__"
-  layer_names: "rnn_forward+delay1@__recurrent_group_0__"
-  layer_names: "rnn_forward@__recurrent_group_0__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "rnn_forward@__recurrent_group_0__"
-    link_name: "rnn_forward+delay1@__recurrent_group_0__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_0__"
-  }
-  out_links {
-    layer_name: "rnn_forward@__recurrent_group_0__"
-    link_name: "rnn_forward"
-  }
-}
-sub_models {
-  name: "__recurrent_group_1__"
-  layer_names: "seq_input@__recurrent_group_1__"
-  layer_names: "rnn_back+delay1@__recurrent_group_1__"
-  layer_names: "rnn_back@__recurrent_group_1__"
-  is_recurrent_layer_group: true
-  reversed: true
-  memories {
-    layer_name: "rnn_back@__recurrent_group_1__"
-    link_name: "rnn_back+delay1@__recurrent_group_1__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_1__"
-  }
-  out_links {
-    layer_name: "rnn_back@__recurrent_group_1__"
-    link_name: "rnn_back"
-  }
-}
-sub_models {
-  name: "__recurrent_group_2__"
-  layer_names: "sub_seq_input@__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward@__recurrent_group_2__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
-    link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  }
-  in_links {
-    layer_name: "sub_seq_input"
-    link_name: "sub_seq_input@__recurrent_group_2__"
-  }
-  out_links {
-    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
-    link_name: "rnn_subseq_forward"
-  }
-}
-sub_models {
-  name: "__lstm_group_0___recurrent_group"
-  layer_names: "__mixed_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_0__"
-    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__"
-  }
-}
-sub_models {
-  name: "__gru_group_0___recurrent_group"
-  layer_names: "__mixed_1__@__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__@__gru_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
-    link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_1__"
-    link_name: "__mixed_1__@__gru_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
-    link_name: "__gru_group_0__"
-  }
-}
-sub_models {
-  name: "__recurrent_group_3__"
-  layer_names: "seq_input@__recurrent_group_3__"
-  layer_names: "__memory_6__@__recurrent_group_3__"
-  layer_names: "__fc_layer_0__@__recurrent_group_3__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__fc_layer_0__@__recurrent_group_3__"
-    link_name: "__memory_6__@__recurrent_group_3__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_3__"
-  }
-  out_links {
-    layer_name: "__fc_layer_0__@__recurrent_group_3__"
-    link_name: "__fc_layer_0__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
deleted file mode 100644
index 0ec88aa998c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
+++ /dev/null
@@ -1,100 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 588
-  active_type: ""
-  height: 14
-  width: 14
-}
-layers {
-  name: "rois"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 3136
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 14
-      img_size: 14
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 14
-      img_size_y: 14
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 14
-  width: 14
-}
-layers {
-  name: "__roi_pool_0__"
-  type: "roi_pool"
-  size: 784
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    roi_pool_conf {
-      pooled_width: 7
-      pooled_height: 7
-      spatial_scale: 0.0625
-    }
-  }
-  inputs {
-    input_layer_name: "rois"
-  }
-  height: 7
-  width: 7
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 432
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "rois"
-output_layer_names: "__roi_pool_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "rois"
-  layer_names: "__conv_0__"
-  layer_names: "__roi_pool_0__"
-  input_layer_names: "data"
-  input_layer_names: "rois"
-  output_layer_names: "__roi_pool_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
deleted file mode 100644
index 19c9f16574c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+++ /dev/null
@@ -1,41 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2560
-  active_type: ""
-}
-layers {
-  name: "__row_conv_layer_0__"
-  type: "row_conv"
-  size: 2560
-  active_type: "relu"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___row_conv_layer_0__.w0"
-    row_conv_conf {
-      context_length: 19
-    }
-  }
-}
-parameters {
-  name: "___row_conv_layer_0__.w0"
-  size: 48640
-  initial_mean: 0.0
-  initial_std: 0.229415733871
-  dims: 19
-  dims: 2560
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__row_conv_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__row_conv_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__row_conv_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
deleted file mode 100644
index c2786ff55c7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__row_l2_norm_layer_0__"
-  type: "row_l2_norm"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__row_l2_norm_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__row_l2_norm_layer_0__"
-  input_layer_names: "input"
-  output_layer_names: "__row_l2_norm_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
deleted file mode 100644
index 35ade126a25..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
+++ /dev/null
@@ -1,72 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__scale_shift_0__"
-  type: "scale_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___scale_shift_0__.w0"
-  }
-}
-layers {
-  name: "__scale_shift_1__"
-  type: "scale_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___scale_shift_1__.w0"
-  }
-  bias_parameter_name: "___scale_shift_1__.wbias"
-}
-parameters {
-  name: "___scale_shift_0__.w0"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___scale_shift_1__.w0"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___scale_shift_1__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__scale_shift_0__"
-output_layer_names: "__scale_shift_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__scale_shift_0__"
-  layer_names: "__scale_shift_1__"
-  input_layer_names: "data"
-  output_layer_names: "__scale_shift_0__"
-  output_layer_names: "__scale_shift_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
deleted file mode 100644
index d20133a10ec..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2016
-  active_type: ""
-  height: 48
-  width: 42
-}
-layers {
-  name: "indices"
-  type: "data"
-  size: 6
-  active_type: ""
-}
-layers {
-  name: "__scale_sub_region_0__"
-  type: "scale_sub_region"
-  size: 2016
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    scale_sub_region_conf {
-      image_conf {
-        channels: 1
-        img_size: 42
-        img_size_y: 48
-      }
-      value: 0.0
-    }
-  }
-  inputs {
-    input_layer_name: "indices"
-  }
-  height: 48
-  width: 42
-}
-input_layer_names: "data"
-input_layer_names: "indices"
-output_layer_names: "__scale_sub_region_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "indices"
-  layer_names: "__scale_sub_region_0__"
-  input_layer_names: "data"
-  input_layer_names: "indices"
-  output_layer_names: "__scale_sub_region_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
deleted file mode 100644
index 9d1b41c9d55..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data1"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data2"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__seqconcat_0__"
-  type: "seqconcat"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data1"
-  }
-  inputs {
-    input_layer_name: "data2"
-  }
-}
-layers {
-  name: "__seqreshape_0__"
-  type: "seqreshape"
-  size: 5
-  active_type: ""
-  inputs {
-    input_layer_name: "data1"
-  }
-}
-input_layer_names: "data1"
-input_layer_names: "data2"
-output_layer_names: "__seqconcat_0__"
-output_layer_names: "__seqreshape_0__"
-sub_models {
-  name: "root"
-  layer_names: "data1"
-  layer_names: "data2"
-  layer_names: "__seqconcat_0__"
-  layer_names: "__seqreshape_0__"
-  input_layer_names: "data1"
-  input_layer_names: "data2"
-  output_layer_names: "__seqconcat_0__"
-  output_layer_names: "__seqreshape_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
deleted file mode 100644
index 5b73d614fe8..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
+++ /dev/null
@@ -1,79 +0,0 @@
-type: "nn"
-layers {
-  name: "word"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "starts"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "ends"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "__seq_slice_layer_0__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "starts"
-  }
-  inputs {
-    input_layer_name: "ends"
-  }
-}
-layers {
-  name: "__seq_slice_layer_1__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "starts"
-  }
-  select_first: true
-}
-layers {
-  name: "__seq_slice_layer_2__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "ends"
-  }
-  select_first: false
-}
-input_layer_names: "word"
-output_layer_names: "__seq_slice_layer_0__"
-output_layer_names: "__seq_slice_layer_1__"
-output_layer_names: "__seq_slice_layer_2__"
-sub_models {
-  name: "root"
-  layer_names: "word"
-  layer_names: "starts"
-  layer_names: "ends"
-  layer_names: "__seq_slice_layer_0__"
-  layer_names: "__seq_slice_layer_1__"
-  layer_names: "__seq_slice_layer_2__"
-  input_layer_names: "word"
-  output_layer_names: "__seq_slice_layer_0__"
-  output_layer_names: "__seq_slice_layer_1__"
-  output_layer_names: "__seq_slice_layer_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
deleted file mode 100644
index 8989561df04..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ /dev/null
@@ -1,162 +0,0 @@
-type: "nn"
-layers {
-  name: "dat_in"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__seq_pooling_0__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_1__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_2__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_3__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_4__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_5__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_6__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_7__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_8__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_9__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  output_max_index: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-input_layer_names: "dat_in"
-output_layer_names: "__seq_pooling_0__"
-output_layer_names: "__seq_pooling_1__"
-output_layer_names: "__seq_pooling_2__"
-output_layer_names: "__seq_pooling_3__"
-output_layer_names: "__seq_pooling_4__"
-output_layer_names: "__seq_pooling_5__"
-output_layer_names: "__seq_pooling_6__"
-output_layer_names: "__seq_pooling_7__"
-output_layer_names: "__seq_pooling_8__"
-output_layer_names: "__seq_pooling_9__"
-sub_models {
-  name: "root"
-  layer_names: "dat_in"
-  layer_names: "__seq_pooling_0__"
-  layer_names: "__seq_pooling_1__"
-  layer_names: "__seq_pooling_2__"
-  layer_names: "__seq_pooling_3__"
-  layer_names: "__seq_pooling_4__"
-  layer_names: "__seq_pooling_5__"
-  layer_names: "__seq_pooling_6__"
-  layer_names: "__seq_pooling_7__"
-  layer_names: "__seq_pooling_8__"
-  layer_names: "__seq_pooling_9__"
-  input_layer_names: "dat_in"
-  output_layer_names: "__seq_pooling_0__"
-  output_layer_names: "__seq_pooling_1__"
-  output_layer_names: "__seq_pooling_2__"
-  output_layer_names: "__seq_pooling_3__"
-  output_layer_names: "__seq_pooling_4__"
-  output_layer_names: "__seq_pooling_5__"
-  output_layer_names: "__seq_pooling_6__"
-  output_layer_names: "__seq_pooling_7__"
-  output_layer_names: "__seq_pooling_8__"
-  output_layer_names: "__seq_pooling_9__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
deleted file mode 100644
index 4aa041ea2e1..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
+++ /dev/null
@@ -1,40 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__smooth_l1_cost_0__"
-  type: "smooth_l1"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-input_layer_names: "input"
-input_layer_names: "label"
-output_layer_names: "__smooth_l1_cost_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "label"
-  layer_names: "__smooth_l1_cost_0__"
-  input_layer_names: "input"
-  input_layer_names: "label"
-  output_layer_names: "__smooth_l1_cost_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
deleted file mode 100644
index 569b0b945a7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
+++ /dev/null
@@ -1,72 +0,0 @@
-model_config {
-  type: "nn"
-  layers {
-    name: "a"
-    type: "data"
-    size: 10
-    active_type: ""
-  }
-  input_layer_names: "a"
-  output_layer_names: "a"
-  sub_models {
-    name: "root"
-    layer_names: "a"
-    input_layer_names: "a"
-    output_layer_names: "a"
-    is_recurrent_layer_group: false
-  }
-}
-data_config {
-  type: "py2"
-  files: "train.list"
-  async_load_data: false
-  for_test: false
-  load_data_module: "a"
-  load_data_object: "c"
-  load_data_args: ""
-  data_ratio: 1
-  is_main_data: true
-  usage_ratio: 1.0
-}
-opt_config {
-  batch_size: 1000
-  algorithm: "sgd"
-  learning_rate: 0.001
-  learning_rate_decay_a: 0.0
-  learning_rate_decay_b: 0.0
-  l1weight: 0.1
-  l2weight: 0.0
-  c1: 0.0001
-  backoff: 0.5
-  owlqn_steps: 10
-  max_backoff: 5
-  l2weight_zero_iter: 0
-  average_window: 0
-  learning_method: "momentum"
-  ada_epsilon: 1e-06
-  do_average_in_cpu: false
-  ada_rou: 0.95
-  learning_rate_schedule: "poly"
-  delta_add_rate: 1.0
-  shrink_parameter_value: 0
-  adam_beta1: 0.9
-  adam_beta2: 0.999
-  adam_epsilon: 1e-08
-  learning_rate_args: ""
-  async_lagged_grad_discard_ratio: 1.5
-}
-test_data_config {
-  type: "py2"
-  files: "test.list"
-  async_load_data: false
-  for_test: true
-  load_data_module: "b"
-  load_data_object: "d"
-  load_data_args: ""
-  data_ratio: 1
-  is_main_data: true
-  usage_ratio: 1.0
-}
-save_dir: "./output/model"
-start_pass: 0
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
deleted file mode 100644
index ca1b2d8cffd..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
+++ /dev/null
@@ -1,40 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 3200
-  active_type: ""
-  height: 20
-  width: 10
-}
-layers {
-  name: "__spp_0__"
-  type: "spp"
-  size: 80
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    spp_conf {
-      image_conf {
-        channels: 16
-        img_size: 10
-        img_size_y: 20
-      }
-      pool_type: "max-projection"
-      pyramid_height: 2
-    }
-  }
-  height: 1
-  width: 5
-}
-input_layer_names: "data"
-output_layer_names: "__spp_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__spp_0__"
-  input_layer_names: "data"
-  output_layer_names: "__spp_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
deleted file mode 100644
index 4b906b113e3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
+++ /dev/null
@@ -1,37 +0,0 @@
-type: "nn"
-layers {
-  name: "input_seq"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "input"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "__sub_nested_seq_layer_0__"
-  type: "sub_nested_seq"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input_seq"
-  }
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input_seq"
-output_layer_names: "__sub_nested_seq_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input_seq"
-  layer_names: "input"
-  layer_names: "__sub_nested_seq_layer_0__"
-  input_layer_names: "input_seq"
-  output_layer_names: "__sub_nested_seq_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
deleted file mode 100644
index 89ed28406e5..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "probs"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__sampling_id_layer_0__"
-  type: "sampling_id"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-}
-input_layer_names: "probs"
-output_layer_names: "__sampling_id_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "probs"
-  layer_names: "__sampling_id_layer_0__"
-  input_layer_names: "probs"
-  output_layer_names: "__sampling_id_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
deleted file mode 100644
index 7a2f3eab388..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
+++ /dev/null
@@ -1,87 +0,0 @@
-type: "nn"
-layers {
-  name: "a"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "b"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__addto_0__"
-  type: "addto"
-  size: 10
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__concat_0__"
-  type: "concat"
-  size: 20
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__concat_1__"
-  type: "concat2"
-  size: 20
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-    proj_conf {
-      type: "identity"
-      name: "___concat_1__.w0"
-      input_size: 10
-      output_size: 10
-    }
-  }
-  inputs {
-    input_layer_name: "b"
-    proj_conf {
-      type: "identity"
-      name: "___concat_1__.w1"
-      input_size: 10
-      output_size: 10
-    }
-  }
-}
-input_layer_names: "a"
-input_layer_names: "b"
-output_layer_names: "__addto_0__"
-output_layer_names: "__concat_0__"
-output_layer_names: "__concat_1__"
-sub_models {
-  name: "root"
-  layer_names: "a"
-  layer_names: "b"
-  layer_names: "__addto_0__"
-  layer_names: "__concat_0__"
-  layer_names: "__concat_1__"
-  input_layer_names: "a"
-  input_layer_names: "b"
-  output_layer_names: "__addto_0__"
-  output_layer_names: "__concat_0__"
-  output_layer_names: "__concat_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
deleted file mode 100755
index c8a3b190b19..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-
-set -e
-PYTHON_EXEC=$1
-COMPARE_PROTO_UTIL=$2
-
-protostr=`dirname $0`/protostr
-
-files=`ls $protostr | grep -v "unittest"`
-
-./generate_protostr.sh ${PYTHON_EXEC}
-
-. ./file_list.sh
-
-if [ -z ${COMPARE_PROTO_UTIL} ]; then
-  for file in $files
-  do
-      base_protostr=$protostr/$file
-      new_protostr=$protostr/$file.unittest
-      diff $base_protostr $new_protostr -u
-      diff $protostr/$file $protostr/$file.non_file_config.unittest -u
-  done
-else
-  for file in ${configs[*]}
-  do
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
-    fi
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
-    fi
-  done
-
-  for file in ${whole_configs[*]}
-  do
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
-    fi
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
-    fi
-  done
-fi
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
deleted file mode 100644
index 3229252a2f4..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-a = data_layer(name='feature_a', size=200)
-b = data_layer(name='feature_b', size=200)
-
-fc_param = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0)
-bias_param = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0)
-
-softmax_param = ParamAttr(
-    name='softmax_param', initial_max=1.0, initial_min=-1.0)
-
-hidden_a = fc_layer(
-    input=a, size=200, param_attr=fc_param, bias_attr=bias_param)
-hidden_b = fc_layer(
-    input=b, size=200, param_attr=fc_param, bias_attr=bias_param)
-
-predict = fc_layer(
-    input=[hidden_a, hidden_b],
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    size=10,
-    act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
deleted file mode 100644
index dff561fdf78..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data_1 = data_layer(name='data_a', size=100)
-data_2 = data_layer(name='data_b', size=100)
-
-mixed_param = ParamAttr(name='mixed_param')
-
-gru_param = ParamAttr(name='gru_param')
-gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)
-
-gru1 = simple_gru(
-    input=data_1,
-    size=200,
-    mixed_param_attr=mixed_param,
-    mixed_bias_param_attr=False,
-    gru_bias_attr=gru_bias,
-    gru_param_attr=gru_param)
-
-gru2 = simple_gru(
-    input=data_2,
-    size=200,
-    mixed_param_attr=mixed_param,
-    mixed_bias_param_attr=False,
-    gru_bias_attr=gru_bias,
-    gru_param_attr=gru_param)
-
-softmax_param = ParamAttr(name='softmax_param')
-
-predict = fc_layer(
-    input=[last_seq(input=gru1), last_seq(input=gru2)],
-    size=10,
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    act=SoftmaxActivation())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
deleted file mode 100644
index 97ef2d07ae8..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data_1 = data_layer(name='data_a', size=100)
-data_2 = data_layer(name='data_b', size=100)
-
-mixed_param = ParamAttr(name='mixed_param')
-
-with mixed_layer(size=400, bias_attr=False) as m1:
-    m1 += full_matrix_projection(input=data_1, param_attr=mixed_param)
-
-with mixed_layer(size=400, bias_attr=False) as m2:
-    m2 += full_matrix_projection(input=data_2, param_attr=mixed_param)
-
-lstm_param = ParamAttr(name='lstm_param')
-lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.)
-
-lstm1 = lstmemory_group(
-    input=m1,
-    param_attr=lstm_param,
-    lstm_bias_attr=lstm_bias,
-    input_proj_bias_attr=False)
-
-lstm2 = lstmemory_group(
-    input=m2,
-    param_attr=lstm_param,
-    lstm_bias_attr=lstm_bias,
-    input_proj_bias_attr=False)
-
-softmax_param = ParamAttr(name='softmax_param')
-
-predict = fc_layer(
-    input=[last_seq(input=lstm1), last_seq(input=lstm2)],
-    size=10,
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    act=SoftmaxActivation())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
deleted file mode 100644
index f882efcba21..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=200)
-
-hidden = fc_layer(input=din, size=200, act=SigmoidActivation())
-
-rnn = recurrent_layer(input=hidden, act=SigmoidActivation())
-
-rnn2 = recurrent_layer(input=hidden, act=SigmoidActivation(), reverse=True)
-
-lstm1_param = fc_layer(
-    input=hidden, size=200 * 4, act=LinearActivation(), bias_attr=False)
-
-lstm1 = lstmemory(input=lstm1_param, act=SigmoidActivation())
-
-lstm2_param = fc_layer(
-    input=hidden, size=200 * 4, act=LinearActivation(), bias_attr=False)
-
-lstm2 = lstmemory(input=lstm2_param, act=SigmoidActivation(), reverse=True)
-
-gru1_param = fc_layer(
-    input=hidden, size=200 * 3, act=LinearActivation(), bias_attr=False)
-gru1 = grumemory(input=gru1_param, act=SigmoidActivation())
-
-gru2_param = fc_layer(
-    input=hidden, size=200 * 3, act=LinearActivation(), bias_attr=False)
-gru2 = grumemory(input=gru2_param, act=SigmoidActivation(), reverse=True)
-
-outputs(
-    last_seq(input=rnn),
-    first_seq(input=rnn2),
-    last_seq(input=lstm1),
-    first_seq(input=lstm2),
-    last_seq(input=gru1),
-    first_seq(gru2))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
deleted file mode 100644
index 169038deb19..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-#data = data_layer(name='data', size=180, width=30, height=6)
-#batchNorm = batch_norm_layer(data, num_channels=1)
-#outputs(batchNorm)
-
-data3D = data_layer(name='data3D', size=120 * 3, width=20, height=6, depth=3)
-batchNorm3D = batch_norm_layer(data3D, num_channels=1, img3D=True)
-outputs(batchNorm3D)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
deleted file mode 100644
index d29e4e5c4d6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=120)
-
-outputs(bidirectional_gru(input=din, size=40, return_seq=True))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
deleted file mode 100644
index 5e724ba7d17..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2304)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)
-
-pool = img_pool_layer(
-    input=bilinear,
-    num_channels=16,
-    pool_size=2,
-    stride=2,
-    pool_type=MaxPooling())
-
-fc = fc_layer(input=pool, size=384, bias_attr=False)
-
-outputs(fc)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
deleted file mode 100644
index 95a1192bfae..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-clip = clip_layer(input=data, min=-10, max=10)
-
-outputs(clip)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
deleted file mode 100644
index 9b791a0222d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import re
-import getopt
-
-
-def main(print_whole_config, globals, locals):
-    '''
-     this test will all test_config.py
-  '''
-    cmdstr = """from paddle.trainer.config_parser import parse_config\n"""
-    importstr = ""
-    functionstr = ""
-
-    for line in sys.stdin:
-        if re.match("^import", line) or re.match("^from.*import", line):
-            importstr = importstr + line
-        else:
-            functionstr = functionstr + "  " + line
-
-    cmdstr = cmdstr + importstr + """def configs():\n""" + functionstr
-    #cmdstr = cmdstr + """def configs():\n""" + importstr + functionstr
-    if print_whole_config:
-        cmdstr = cmdstr + """print parse_config(configs, "")"""
-    else:
-        cmdstr = cmdstr + """print parse_config(configs, "").model_config"""
-
-    exec (cmdstr, globals, locals)
-
-
-if __name__ == '__main__':
-    whole = False
-    opts, args = getopt.getopt(sys.argv[1:], "", ["whole"])
-    for op, value in opts:
-        if op == "--whole":
-            whole = True
-    main(whole, globals(), locals())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
deleted file mode 100644
index f9966e399e7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-num_channels = 3
-filter_size = 3
-filter_size_y = 3
-filter_size_z = 3
-stride = 2
-stride_y = 2
-stride_z = 2
-padding = 1
-padding_y = 1
-padding_z = 1
-groups = 1
-
-data = data_layer(
-    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
-# first
-conv3d_1 = img_conv3d_layer(
-    input=data,
-    name='conv3d_1',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=filter_size,
-    stride=stride,
-    padding=padding,
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=False,
-    layer_type="conv3d",
-    act=LinearActivation())
-# second
-conv3d_2 = img_conv3d_layer(
-    input=data,
-    name='conv3d_2',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=[filter_size, filter_size_y, filter_size_z],
-    stride=[stride, stride_y, stride_z],
-    padding=[padding, padding_y, padding_z],
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=False,
-    layer_type="conv3d",
-    act=LinearActivation())
-outputs(conv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
deleted file mode 100644
index 351694fd55c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-seq_in = data_layer(name='input', size=200)
-labels = data_layer(name='labels', size=5000)
-
-probs = data_layer(name='probs', size=10)
-xe_label = data_layer(name='xe-label', size=10)
-
-hidden = fc_layer(input=seq_in, size=4)
-outputs(
-    ctc_layer(
-        input=seq_in, label=labels),
-    warp_ctc_layer(
-        input=seq_in, label=labels, blank=0),
-    crf_layer(
-        input=hidden, label=data_layer(
-            name='crf_label', size=4)),
-    rank_cost(
-        left=data_layer(
-            name='left', size=1),
-        right=data_layer(
-            name='right', size=1),
-        label=data_layer(
-            name='label', size=1)),
-    lambda_cost(
-        input=data_layer(
-            name='list_feature', size=100),
-        score=data_layer(
-            name='list_scores', size=1)),
-    cross_entropy(
-        input=probs, label=xe_label),
-    cross_entropy_with_selfnorm(
-        input=probs, label=xe_label),
-    huber_regression_cost(
-        input=seq_in, label=labels),
-    huber_classification_cost(
-        input=data_layer(
-            name='huber_probs', size=1),
-        label=data_layer(
-            name='huber_label', size=1)),
-    multi_binary_label_cross_entropy(
-        input=probs, label=xe_label),
-    sum_cost(input=hidden),
-    nce_layer(
-        input=hidden, label=labels))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
deleted file mode 100644
index 8cbcf5de0a3..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data = data_layer(name='input', size=300)
-lbl = data_layer(name='label', size=1)
-wt = data_layer(name='weight', size=1)
-fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=fc, label=lbl, weight=wt),
-    square_error_cost(
-        input=fc, label=lbl, weight=wt),
-    nce_layer(
-        input=fc,
-        label=data_layer(
-            name='multi_class_label', size=500),
-        weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
deleted file mode 100644
index b4ffff252bb..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-refernce_data = data_layer(name='data', size=768, height=16, width=16)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
-
-crop = crop_layer(input=[pool, refernce_data], axis=2)
-
-outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
deleted file mode 100644
index 4a5bdf1181d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-
-from paddle.trainer_config_helpers import *
-beam_size = 5
-
-# the first beam expansion.
-sentence_states = data_layer(name="sentence_states", size=32)
-sentence_scores = data_layer(name="sentence_scores", size=1)
-topk_sentence_ids = kmax_seq_score_layer(
-    input=sentence_scores, beam_size=beam_size)
-
-# the second beam expansion.
-topk_sen = sub_nested_seq_layer(
-    input=sentence_states, selected_indices=topk_sentence_ids)
-start_pos_scores = fc_layer(input=topk_sen, size=1, act=LinearActivation())
-topk_start_pos_ids = kmax_seq_score_layer(
-    input=sentence_scores, beam_size=beam_size)
-
-# the final beam expansion.
-topk_start_spans = seq_slice_layer(
-    input=topk_sen, starts=topk_start_pos_ids, ends=None)
-end_pos_scores = fc_layer(
-    input=topk_start_spans, size=1, act=LinearActivation())
-topk_end_pos_ids = kmax_seq_score_layer(
-    input=end_pos_scores, beam_size=beam_size)
-
-# define the cost
-sentence_idx = data_layer(name="sentences_ids", size=1)
-start_idx = data_layer(name="start_ids", size=1)
-end_idx = data_layer(name="end_ids", size=1)
-cost = cross_entropy_over_beam(input=[
-    BeamInput(
-        candidate_scores=sentence_scores,
-        selected_candidates=topk_sentence_ids,
-        gold=sentence_idx), BeamInput(
-            candidate_scores=start_pos_scores,
-            selected_candidates=topk_start_pos_ids,
-            gold=start_idx), BeamInput(
-                candidate_scores=end_pos_scores,
-                selected_candidates=topk_end_pos_ids,
-                gold=end_idx)
-])
-
-outputs(cost)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
deleted file mode 100644
index 08e701c7a8d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-num_channels = 3
-filter_size = 3
-filter_size_y = 3
-filter_size_z = 3
-stride = 2
-stride_y = 2
-stride_z = 2
-padding = 1
-padding_y = 1
-padding_z = 1
-groups = 1
-
-data = data_layer(
-    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
-
-# first
-deconv3d_1 = img_conv3d_layer(
-    input=data,
-    name='deconv3d_1',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=filter_size,
-    stride=stride,
-    padding=padding,
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=True,
-    layer_type="deconv3d",
-    act=LinearActivation())
-# second
-deconv3d_2 = img_conv3d_layer(
-    input=data,
-    name='deconv3d_2',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=[filter_size, filter_size_y, filter_size_z],
-    stride=[stride, stride_y, stride_z],
-    padding=[padding, padding_y, padding_z],
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=True,
-    layer_type="deconv3d",
-    act=LinearActivation())
-outputs(deconv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
deleted file mode 100644
index 4ecd1c2b7e0..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
-
-input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
-
-priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
-
-detout = detection_output_layer(
-    input_loc=input_loc,
-    input_conf=input_conf,
-    priorbox=priorbox,
-    num_classes=21,
-    nms_threshold=0.45,
-    nms_top_k=400,
-    keep_top_k=200,
-    confidence_threshold=0.01,
-    background_id=0,
-    name='test_detection_output')
-
-outputs(detout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
deleted file mode 100644
index 9b444bc2c02..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-vec1 = data_layer(name='vector1', size=10)
-vec2 = data_layer(name='vector2', size=10)
-dot_product = dot_prod_layer(input1=vec1, input2=vec2)
-
-outputs(dot_product)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
deleted file mode 100644
index 85101d2b927..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-data_seq = data_layer(name='data_seq', size=30)
-
-outputs(
-    expand_layer(
-        input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_SEQUENCE),
-    expand_layer(
-        input=din,
-        expand_as=data_seq,
-        expand_level=ExpandLevel.FROM_NO_SEQUENCE))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
deleted file mode 100644
index 48ac46c5bb6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=1024)
-
-fm = factorization_machine(input=data, factor_size=10)
-
-outputs(fm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
deleted file mode 100644
index f1e454d2112..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=100)
-
-trans = trans_layer(input=din)
-
-hidden = fc_layer(input=trans, size=100, bias_attr=False)
-
-mask = data_layer(name='mask', size=100)
-
-hidden_sel = selective_fc_layer(
-    input=din, select=mask, size=100, act=SigmoidActivation())
-
-outputs(hidden, hidden_sel)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
deleted file mode 100644
index afc3e9207c5..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=256)
-glu = gated_unit_layer(
-    size=512,
-    input=data,
-    act=TanhActivation(),
-    gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
-    gate_param_attr=ParamAttr(initial_std=1e-4),
-    gate_bias_attr=ParamAttr(initial_std=1),
-    inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
-    inproj_param_attr=ParamAttr(initial_std=1e-4),
-    inproj_bias_attr=ParamAttr(initial_std=1),
-    layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0))
-
-outputs(glu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
deleted file mode 100644
index ac9902d08c6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=120)
-
-outputs(
-    grumemory(
-        input=din,
-        size=40,
-        reverse=True,
-        gate_act=TanhActivation(),
-        act=SigmoidActivation()))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
deleted file mode 100644
index da781c149b8..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='data', size=100)
-label = data_layer(name='label', size=10)
-
-outputs(hsigmoid(input=din, label=label, num_classes=10))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
deleted file mode 100644
index 171da10f75d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name="input_seq", size=128)
-scores = fc_layer(input=data, size=1, act=ExpActivation())
-kmax_seq_id = kmax_seq_score_layer(input=scores, beam_size=5)
-
-outputs(kmax_seq_id)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
deleted file mode 100644
index 42c9b5deea7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-outputs(
-    l2_distance_layer(
-        x=data_layer(
-            name='x', size=128), y=data_layer(
-                name='y', size=128)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
deleted file mode 100644
index 26eeea5461f..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=128)
-
-outputs(
-    lstmemory(
-        input=din,
-        reverse=True,
-        gate_act=TanhActivation(),
-        act=TanhActivation(),
-        size=32))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
deleted file mode 100644
index 2cd41a306a7..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2304, height=48, width=48)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-maxout = maxout_layer(input=conv, num_channels=16, groups=2)
-
-pool = img_pool_layer(
-    input=maxout, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
-
-conv2 = img_conv_layer(
-    input=pool,
-    filter_size=3,
-    num_channels=8,
-    num_filters=128,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)
-
-block = block_expand_layer(
-    input=maxout2,
-    num_channels=32,
-    stride_x=1,
-    stride_y=1,
-    block_x=1,
-    block_y=6)
-
-fc = fc_layer(input=block, size=384, bias_attr=False)
-
-outputs(fc)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
deleted file mode 100644
index b4fd9052c41..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
-
-input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
-
-priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
-
-label = data_layer(name='label', size=24, height=4, width=6)
-
-multibox_loss = multibox_loss_layer(
-    input_loc=input_loc,
-    input_conf=input_conf,
-    priorbox=priorbox,
-    label=label,
-    num_classes=21,
-    overlap_threshold=0.5,
-    neg_pos_ratio=3.0,
-    neg_overlap=0.5,
-    background_id=0,
-    name='test_multibox_loss')
-
-outputs(multibox_loss)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
deleted file mode 100644
index bfba07be869..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-index = data_layer(name='index', size=1)
-din1 = data_layer(name='data1', size=30)
-din2 = data_layer(name='data2', size=30)
-din3 = data_layer(name='data3', size=30)
-
-dout = multiplex_layer([index, din1, din2, din3])
-
-outputs(dout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
deleted file mode 100644
index 891894172c5..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-weight = data_layer(name='w', size=1)
-a = data_layer(name='a', size=100)
-b = data_layer(name='b', size=100)
-c = data_layer(name='c', size=200)
-d = data_layer(name='d', size=31)
-
-outputs(
-    interpolation_layer(
-        input=[a, b], weight=weight),
-    power_layer(
-        input=a, weight=weight),
-    scaling_layer(
-        input=a, weight=weight),
-    cos_sim(
-        a=a, b=b),
-    cos_sim(
-        a=a, b=c, size=2),
-    sum_to_one_norm_layer(input=a),
-    conv_shift_layer(
-        a=a, b=d),
-    tensor_layer(
-        a=a, b=b, size=1000),
-    slope_intercept_layer(
-        input=a, slope=0.7, intercept=0.9),
-    linear_comb_layer(
-        weights=b, vectors=c))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
deleted file mode 100644
index c5825c82e5b..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
-
-pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
-
-outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
deleted file mode 100644
index 5ff52c195a4..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_rate=1e-5)
-
-data_2d = data_layer(name='data_2d', size=6000, height=20, width=10)
-
-pool_2d = img_pool_layer(
-    name="pool___2d",
-    input=data_2d,
-    num_channels=30,
-    pool_size=5,
-    stride=3,
-    padding=1,
-    pool_type=AvgPooling())
-outputs(pool_2d)
-
-data_3d = data_layer(
-    name='data_3d_1', size=60000, depth=10, height=20, width=10)
-
-pool_3d_1 = img_pool3d_layer(
-    name="pool_3d_1",
-    input=data_3d,
-    num_channels=30,
-    pool_size=5,
-    stride=3,
-    padding=1,
-    pool_type=AvgPooling())
-outputs(pool_3d_1)
-
-pool_3d_2 = img_pool3d_layer(
-    name="pool_3d_2",
-    input=data_3d,
-    num_channels=30,
-    pool_size=[5, 5, 5],
-    stride=[3, 3, 3],
-    padding=[1, 1, 1],
-    pool_type=MaxPooling())
-outputs(pool_3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
deleted file mode 100644
index d803a0d13d5..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300, height=10, width=10)
-prelu = prelu_layer(input=data, num_channels=3)
-prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
-prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
-prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
-prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)
-
-outputs(prelu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
deleted file mode 100644
index ca1f5a45724..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='input', size=100)
-
-print_layer(input=din)
-
-outputs(din)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
deleted file mode 100644
index d44870d804f..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=100)
-
-enc = din
-for i in range(32):
-    enc = addto_layer([enc, enc])
-
-pred = fc_layer(
-    input=fc_layer(
-        input=enc, size=32, act=ReluActivation()),
-    size=10,
-    act=SoftmaxActivation())
-outputs(pred)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
deleted file mode 100644
index ee90e830df1..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-
-outputs(
-    repeat_layer(
-        input=din, num_repeats=10, as_row_vector=True),
-    repeat_layer(
-        input=din, num_repeats=10, act=TanhActivation(), as_row_vector=False))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
deleted file mode 100644
index 4aa81919dfd..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-resized = resize_layer(input=data, size=150)
-
-outputs(resized)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
deleted file mode 100644
index 3824ef59953..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-seq = data_layer(name='seq_input', size=100)
-sub_seq = data_layer(name='sub_seq_input', size=100)
-lbl = data_layer(name='label', size=1)
-
-
-def generate_rnn_simple(name):
-    def rnn_simple(s):
-        m = memory(name=name, size=200)
-        fc = fc_layer(input=[s, m], size=200, name=name)
-        return fc
-
-    return rnn_simple
-
-
-def generate_rnn_simple_no_name():
-    def rnn_simple(s):
-        m = memory(name=None, size=200)
-        fc = fc_layer(input=[s, m], size=200)
-        m.set_input(fc)
-        return fc
-
-    return rnn_simple
-
-
-with mixed_layer() as lstm_param:  # test lstm unit, rnn group
-    lstm_param += full_matrix_projection(input=seq, size=100 * 4)
-
-with mixed_layer() as gru_param:
-    gru_param += full_matrix_projection(input=seq, size=100 * 3)
-
-outputs(
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_forward'), input=seq)),
-    first_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_back'), input=seq, reverse=True)),
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_subseq_forward'),
-        input=SubsequenceInput(input=sub_seq))),
-    last_seq(input=lstmemory_group(
-        input=lstm_param, size=100)),
-    last_seq(input=gru_group(
-        input=gru_param, size=100)),
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple_no_name(), input=seq)), )
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
deleted file mode 100644
index 6929d106c64..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
-
-rois = data_layer(name='rois', size=10)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=3,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-roi_pool = roi_pool_layer(
-    input=conv,
-    rois=rois,
-    pooled_width=7,
-    pooled_height=7,
-    spatial_scale=1. / 16)
-
-outputs(roi_pool)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
deleted file mode 100644
index 6381a26fe84..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2560)
-
-row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
-
-outputs(row_conv)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
deleted file mode 100644
index 3c17d2ccfd6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-row_l2_norm = row_l2_norm_layer(input=data)
-
-outputs(row_l2_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
deleted file mode 100644
index ae8a25ba94d..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=100)
-
-scale = scale_shift_layer(input=data, bias_attr=False)
-
-scale_shift = scale_shift_layer(input=data)
-
-outputs(scale, scale_shift)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
deleted file mode 100644
index e4f7120bcce..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-indices = data_layer(name='indices', size=6)
-
-scale_sub_region = scale_sub_region_layer(
-    input=data, indices=indices, value=0.0)
-
-outputs(scale_sub_region)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
deleted file mode 100644
index a6be069e7e2..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din1 = data_layer(name='data1', size=30)
-din2 = data_layer(name='data2', size=30)
-
-opts = []
-opts.append(seq_concat_layer(a=din1, b=din2))
-opts.append(seq_reshape_layer(input=din1, reshape_size=5))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
deleted file mode 100644
index 510ad322089..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-input_seq = data_layer("word", size=128)
-starts = data_layer("starts", size=5)
-ends = data_layer("ends", size=5)
-
-seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
-seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
-seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
-
-outputs(seq_slice1, seq_slice2, seq_slice3)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
deleted file mode 100644
index 7b951a4cd79..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='dat_in', size=100)
-
-POOL_TYPE = [MaxPooling, AvgPooling, SumPooling]
-
-AGG_LEVEL = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
-
-opts = []
-
-for pt in POOL_TYPE:
-    for al in AGG_LEVEL:
-        opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
-
-for pt in POOL_TYPE:
-    opts.append(
-        pooling_layer(
-            input=din,
-            agg_level=AggregateLevel.TO_NO_SEQUENCE,
-            pooling_type=pt(),
-            stride=5))
-
-opts.append(
-    pooling_layer(
-        input=din, pooling_type=MaxPooling(output_max_index=True)))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
deleted file mode 100644
index 32a4e6f6d08..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-lbl = data_layer(name='label', size=300)
-smooth_l1 = smooth_l1_cost(input=data, label=lbl)
-
-outputs(smooth_l1)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
deleted file mode 100644
index ea68b5493ee..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(
-    train_list="train.list",
-    test_list="test.list",
-    module=["a", "b"],
-    obj=("c", "d"))
-settings(learning_rate=1e-3, batch_size=1000)
-
-outputs(data_layer(name="a", size=10))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
deleted file mode 100644
index 0e692d4b62c..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_rate=1e-5)
-
-data = data_layer(name='data', size=3200, height=20, width=10)
-
-spp = spp_layer(
-    input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())
-
-outputs(spp)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
deleted file mode 100644
index 6d1c3175ba9..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-beam_size = 5
-
-data = data_layer(name='input_seq', size=300)
-selected_ids = data_layer(name='input', size=beam_size)
-sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
-
-outputs(sub_nest_seq)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
deleted file mode 100644
index 8878e73fff6..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-settings(batch_size=1000, learning_rate=1e-4)
-
-probs = data_layer(name='probs', size=100)
-
-outputs(
-    sampling_id_layer(input=probs),  # It seems not support training
-
-    # It seems this layer is not correct, and should be rewrite.
-    # block_expand_layer(input=probs, channel=1, block_x=1, block_y=3),
-)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
deleted file mode 100644
index da134f100b9..00000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-a = data_layer(name='a', size=10)
-b = data_layer(name='b', size=10)
-
-result = addto_layer(input=[a, b])
-concat1 = concat_layer(input=[a, b])
-concat2 = concat_layer(
-    input=[identity_projection(input=a), identity_projection(input=b)])
-
-outputs(result, concat1, concat2)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
deleted file mode 100644
index b3dd8f8fc78..00000000000
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config_and_serialize
-
-if __name__ == '__main__':
-    parse_config_and_serialize(
-        'trainer_config_helpers/tests/layers_test_config.py', '')
-# layers_test_config.py
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
deleted file mode 100644
index e6cd35ee761..00000000000
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-num_classes = 5
-
-x = data_layer(name="input1", size=3)
-y = data_layer(name="input2", size=5)
-
-z = out_prod_layer(input1=x, input2=y)
-
-x1 = fc_layer(input=x, size=5)
-y1 = fc_layer(input=y, size=5)
-
-z1 = mixed_layer(
-    act=LinearActivation(),
-    input=[
-        conv_operator(
-            img=x1,
-            filter=y1,
-            filter_size=1,
-            num_filters=5,
-            num_channels=5,
-            stride=1)
-    ])
-
-assert z1.size > 0
-
-y2 = fc_layer(input=y, size=15)
-z2 = rotate_layer(input=y2, height=5, width=3)
-
-cos1 = cos_sim(a=x1, b=y1)
-cos3 = cos_sim(a=x1, b=y2, size=3)
-
-linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
-
-out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1, z2],
-    size=num_classes,
-    act=SoftmaxActivation())
-
-print_layer(input=[out])
-
-outputs(classification_cost(out, data_layer(name="label", size=num_classes)))
-
-dotmul = mixed_layer(
-    input=[dotmul_operator(
-        a=x1, b=x1), dotmul_projection(input=y1)])
-
-proj_with_attr_init = mixed_layer(
-    input=full_matrix_projection(
-        input=y1,
-        param_attr=ParamAttr(
-            learning_rate=0, initial_mean=0, initial_std=0)),
-    bias_attr=ParamAttr(
-        initial_mean=0, initial_std=0, learning_rate=0),
-    act=LinearActivation(),
-    size=5,
-    name='proj_with_attr_init')
-
-# for ctc
-tmp = fc_layer(
-    input=[x1, dotmul, proj_with_attr_init],
-    size=num_classes + 1,
-    act=SoftmaxActivation())
-ctc = ctc_layer(input=tmp, label=y, size=num_classes + 1)
-ctc_eval = ctc_error_evaluator(input=tmp, label=y)
-
-settings(
-    batch_size=10,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
diff --git a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
deleted file mode 100644
index 4d7542c35b2..00000000000
--- a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from paddle.trainer.config_parser import parse_config
-
-
-class TestParse(unittest.TestCase):
-    def test_parse(self):
-        a = parse_config('trainer_config_helpers/tests/layers_test_config.py',
-                         '')
-        b = parse_config('trainer_config_helpers/tests/layers_test_config.py',
-                         '')
-        self.assertEqual(a, b)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/trainer_config_helpers/utils.py b/python/paddle/trainer_config_helpers/utils.py
deleted file mode 100644
index fe6e9cd53cc..00000000000
--- a/python/paddle/trainer_config_helpers/utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import logger
-import functools
-
-__all__ = ['deprecated']
-
-
-def deprecated(instead):
-    def __impl__(func):
-        @functools.wraps(func)
-        def __wrapper__(*args, **kwargs):
-            logger.warning("The interface %s is deprecated, "
-                           "will be removed soon. Please use %s instead." %
-                           (func.__name__, instead))
-
-            return func(*args, **kwargs)
-
-        return __wrapper__
-
-    return __impl__
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
deleted file mode 100644
index df710c33d0c..00000000000
--- a/python/paddle/v2/__init__.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import optimizer
-import layer
-import activation
-import parameters
-import trainer
-import event
-import data_type
-import topology
-import networks
-import evaluator
-from . import dataset
-from . import reader
-from . import plot
-import attr
-import op
-import pooling
-import inference
-import networks
-import minibatch
-import plot
-import image
-import paddle.trainer.config_parser as cp
-
-__all__ = [
-    'default_startup_program',
-    'default_main_program',
-    'optimizer',
-    'layer',
-    'activation',
-    'parameters',
-    'init',
-    'trainer',
-    'event',
-    'data_type',
-    'attr',
-    'pooling',
-    'dataset',
-    'reader',
-    'topology',
-    'networks',
-    'infer',
-    'plot',
-    'evaluator',
-    'image',
-    'master',
-]
-
-cp.begin_parse()
-
-
-def set_env_vars(trainer_count):
-    '''Auto set CPU environment if have not set before.
-       For MKL:
-         export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
-         export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
-       For OpenBLAS:
-         export OPENBLAS_NUM_THREADS, OPENBLAS_MAIN_FREE according to trainer_count. 
-    '''
-    import platform, paddle
-    if not platform.system() in ['Linux', 'Darwin']:
-        return
-
-    def set_env(key, value):
-        '''If the key has not been set in the environment, set it with value.'''
-        assert isinstance(key, str)
-        assert isinstance(value, str)
-        envset = os.environ.get(key)
-        if envset is None:
-            os.environ[key] = value
-
-    def num_physical_cores():
-        '''Get the number of physical cores'''
-        if platform.system() == "Linux":
-            num_sockets = int(
-                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
-                .read())
-            num_cores_per_socket = int(
-                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
-                .read())
-            return num_sockets * num_cores_per_socket
-        else:
-            cmds = {"Darwin": "sysctl -n hw.physicalcpu"}
-            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
-
-    def num_logical_processors():
-        '''Get the number of logical processors'''
-        cmds = {
-            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
-            "Darwin": "sysctl -n hw.logicalcpu"
-        }
-        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
-
-    num_cores = num_physical_cores()
-    num_processors = num_logical_processors()
-    if paddle.version.mkl() == 'ON':
-        if num_processors > num_cores:  # Hyper Threading is enabled
-            set_env("OMP_DYNAMIC", "true")
-            set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
-        else:
-            set_env("OMP_DYNAMIC", "false")
-            set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
-    threads = num_processors / trainer_count
-    threads = '1' if threads < 1 else str(threads)
-    if paddle.version.mkl() == 'ON':
-        set_env("OMP_NUM_THREADS", threads)
-        set_env("MKL_NUM_THREADS", threads)
-    else:
-        set_env("OPENBLAS_NUM_THREADS", threads)
-        if threads > 1:
-            set_env("OPENBLAS_MAIN_FREE", '1')
-
-
-def init(**kwargs):
-    import py_paddle.swig_paddle as api
-    args = []
-    args_dict = {}
-    # NOTE: append arguments if they are in ENV
-    for ek, ev in os.environ.iteritems():
-        if ek.startswith("PADDLE_INIT_"):
-            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
-
-    args_dict.update(kwargs)
-    # NOTE: overwrite arguments from ENV if it is in kwargs
-    for key in args_dict.keys():
-        args.append('--%s=%s' % (key, str(args_dict[key])))
-
-    set_env_vars(kwargs.get('trainer_count', 1))
-
-    if 'use_gpu' in kwargs:
-        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
-    if 'use_mkldnn' in kwargs:
-        cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
-    if 'use_mkl_packed' in kwargs:
-        cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed']
-    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
-                                         "supported in v2 APIs.")
-
-    api.initPaddle(*args)
-
-
-infer = inference.infer
-batch = minibatch.batch
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
deleted file mode 100644
index 21261a17820..00000000000
--- a/python/paddle/v2/activation.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.activations
-import copy
-
-__all__ = []
-
-suffix = 'Activation'
-for act in paddle.trainer_config_helpers.activations.__all__:
-    new_name = act[:-len(suffix)]
-    globals()[new_name] = copy.copy(
-        getattr(paddle.trainer_config_helpers.activations, act))
-    globals()[new_name].__name__ = new_name
-    __all__.append(new_name)
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
deleted file mode 100644
index 5d23894d735..00000000000
--- a/python/paddle/v2/attr.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.attrs
-
-__all__ = [
-    "Param",
-    "Extra",
-    "Hook",
-]
-
-Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
-Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
-Hook = paddle.trainer_config_helpers.attrs.HookAttribute
-
-for each in paddle.trainer_config_helpers.attrs.__all__:
-    globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
-    __all__.append(each)
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
deleted file mode 100644
index d9613e001ac..00000000000
--- a/python/paddle/v2/config_base.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import re
-import paddle.trainer_config_helpers as conf_helps
-
-__layer_map__ = {}
-
-
-def __map_docstr__(doc, name):
-    if doc is None:
-        return doc
-
-    assert isinstance(doc, basestring)
-
-    # replace LayerOutput to paddle.v2.config_base.Layer
-    doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
-
-    doc = doc.replace('ParameterAttribute', 'paddle.v2.attr.ParameterAttribute')
-
-    doc = re.sub(r'ExtraLayerAttribute[^\s]?', 'paddle.v2.attr.ExtraAttribute',
-                 doc)
-
-    # xxx_layer to xxx
-    doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
-
-    # XxxxActivation to paddle.v2.activation.Xxxx
-    doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
-                 r"paddle.v2.activation.\g<name>", doc)
-
-    # xxx_evaluator to paddle.v2.evaluator.xxx
-    doc = re.sub(r"(?P<name>[a-z]+)_evaluator", r"evaluator.\g<name>", doc)
-
-    # TODO(yuyang18): Add more rules if needed.
-    return doc
-
-
-def __convert_to_v2__(f, name, module):
-    def wrapped(*args, **xargs):
-        out = f(*args, **xargs)
-        outs = out
-        if not isinstance(out, collections.Sequence):
-            outs = [out]
-        for l in outs:
-            if isinstance(l, conf_helps.LayerOutput):
-                __layer_map__[l.full_name] = l
-        return out
-
-    wrapped.__doc__ = __map_docstr__(f.__doc__, name)
-    wrapped.__name__ = name
-    wrapped.__module__ = module
-
-    return wrapped
-
-
-Layer = conf_helps.LayerOutput
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
deleted file mode 100644
index 98dfb85a0ea..00000000000
--- a/python/paddle/v2/data_feeder.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from py_paddle import DataProviderConverter
-import collections
-import paddle.trainer.PyDataProvider2 as pydp2
-
-__all__ = ['DataFeeder']
-
-
-def default_feeding_map(data_types):
-    reader_dict = dict()
-    for i, tp in enumerate(data_types):
-        reader_dict[tp[0]] = i
-    return reader_dict
-
-
-class DataFeeder(DataProviderConverter):
-    """
-    DataFeeder converts the data returned by paddle.reader into a data structure
-    of Arguments which is defined in the API. The paddle.reader usually returns
-    a list of mini-batch data entries. Each data entry in the list is one sample.
-    Each sample is a list or a tuple with one feature or multiple features.
-    DataFeeder converts this mini-batch data entries into Arguments in order
-    to feed it to C++ interface.
-    
-    The simple usage shows below
-
-    ..  code-block:: python
-
-        feeding = ['image', 'label']
-        data_types = enumerate_data_types_of_data_layers(topology)
-        feeder = DataFeeder(data_types=data_types, feeding=feeding)
-
-        minibatch_data = [([1.0, 2.0, 3.0, ...], 5)]
-
-        arg = feeder(minibatch_data)
-
-
-    If mini-batch data and data layers are not one to one mapping, we
-    could pass a dictionary to feeding parameter to represent the mapping
-    relationship.
-
-
-    ..  code-block:: python
-
-        data_types = [('image', paddle.data_type.dense_vector(784)),
-                      ('label', paddle.data_type.integer_value(10))]
-        feeding = {'image':0, 'label':1}
-        feeder = DataFeeder(data_types=data_types, feeding=feeding)
-        minibatch_data = [
-                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
-                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
-                         ]
-        # or minibatch_data = [
-        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
-        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
-        #                     ]
-        arg = feeder.convert(minibatch_data)
-
-    ..  note::
-
-        This module is for internal use only. Users should use the `reader`
-        interface.
-
-
-
-    :param data_types: A list to specify data name and type. Each item is
-                       a tuple of (data_name, data_type).
-
-    :type data_types: list
-    :param feeding: A dictionary or a sequence to specify the position of each
-                    data in the input data.
-    :type feeding: dict|collections.Sequence|None
-    """
-
-    def __init__(self, data_types, feeding=None):
-        self.input_names = []
-        input_types = []
-        if feeding is None:
-            feeding = default_feeding_map(data_types)
-        elif isinstance(feeding, collections.Sequence):
-            feed_list = feeding
-            feeding = dict()
-            for i, name in enumerate(feed_list):
-                feeding[name] = i
-        elif not isinstance(feeding, dict):
-            raise TypeError("Feeding should be dict or sequence or None.")
-
-        self.feeding = feeding
-        for each in data_types:
-            self.input_names.append(each[0])
-            if not isinstance(each[1], pydp2.InputType):
-                raise TypeError("second item in each data_type should be an "
-                                "InputType")
-            input_types.append(each[1])
-        DataProviderConverter.__init__(self, input_types)
-
-    def __len__(self):
-        return len(self.input_names)
-
-    def convert(self, dat, argument=None):
-        """
-        :param dat: A list of mini-batch data. Each sample is a list or tuple
-                    one feature or multiple features.
-
-        :type dat: list
-        :param argument: An Arguments object contains this mini-batch data with
-                         one or multiple features. The Arguments definition is
-                         in the API.
-        :type argument: py_paddle.swig_paddle.Arguments
-        """
-
-        def reorder_data(data):
-            retv = []
-            for each in data:
-                reorder = []
-                for name in self.input_names:
-                    reorder.append(each[self.feeding[name]])
-                retv.append(reorder)
-            return retv
-
-        return DataProviderConverter.convert(self, reorder_data(dat), argument)
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
deleted file mode 100644
index 226997465f2..00000000000
--- a/python/paddle/v2/data_type.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer.PyDataProvider2 as pydp2
-
-import_list = [
-    nm for nm in dir(pydp2)
-    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm or
-                                       'array' in nm)
-]
-import_list.extend(['InputType'])
-
-for nm in import_list:
-    globals()[nm] = getattr(pydp2, nm)
-
-__all__ = import_list
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
deleted file mode 100644
index 38056fe0a94..00000000000
--- a/python/paddle/v2/dataset/__init__.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Dataset package.
-"""
-
-import mnist
-import imikolov
-import imdb
-import cifar
-import movielens
-import conll05
-import uci_housing
-import sentiment
-import wmt14
-import wmt16
-import mq2007
-import flowers
-import voc2012
-
-__all__ = [
-    'mnist',
-    'imikolov',
-    'imdb',
-    'cifar',
-    'movielens',
-    'conll05',
-    'sentiment',
-    'uci_housing',
-    'wmt14',
-    'wmt16',
-    'mq2007',
-    'flowers',
-    'voc2012',
-]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
deleted file mode 100644
index 662655c836d..00000000000
--- a/python/paddle/v2/dataset/cifar.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-CIFAR dataset.
-
-This module will download dataset from
-https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
-paddle reader creators.
-
-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
-with 6000 images per class. There are 50000 training images and 10000 test
-images.
-
-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
-containing 600 images each. There are 500 training images and 100 testing
-images per class.
-
-"""
-
-import cPickle
-import itertools
-import numpy
-import paddle.v2.dataset.common
-import tarfile
-
-__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
-
-URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
-CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
-CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
-CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
-CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
-
-
-def reader_creator(filename, sub_name, cycle=False):
-    def read_batch(batch):
-        data = batch['data']
-        labels = batch.get('labels', batch.get('fine_labels', None))
-        assert labels is not None
-        for sample, label in itertools.izip(data, labels):
-            yield (sample / 255.0).astype(numpy.float32), int(label)
-
-    def reader():
-        with tarfile.open(filename, mode='r') as f:
-            names = (each_item.name for each_item in f
-                     if sub_name in each_item.name)
-
-            while True:
-                for name in names:
-                    batch = cPickle.load(f.extractfile(name))
-                    for item in read_batch(batch):
-                        yield item
-                if not cycle:
-                    break
-
-    return reader
-
-
-def train100():
-    """
-    CIFAR-100 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 99].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
-
-
-def test100():
-    """
-    CIFAR-100 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
-
-
-def train10(cycle=False):
-    """
-    CIFAR-10 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch',
-        cycle=cycle)
-
-
-def test10(cycle=False):
-    """
-    CIFAR-10 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch',
-        cycle=cycle)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
-    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
-    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
-    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
deleted file mode 100644
index c6ff09a1d1e..00000000000
--- a/python/paddle/v2/dataset/common.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import requests
-import hashlib
-import os
-import errno
-import shutil
-import sys
-import importlib
-import paddle.v2.dataset
-import cPickle
-import glob
-import cPickle as pickle
-
-__all__ = [
-    'DATA_HOME',
-    'download',
-    'md5file',
-    'split',
-    'cluster_files_reader',
-    'convert',
-]
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
-
-
-# When running unit tests, there could be multiple processes that
-# trying to create DATA_HOME directory simultaneously, so we cannot
-# use a if condition to check for the existence of the directory;
-# instead, we use the filesystem as the synchronization mechanism by
-# catching returned errors.
-def must_mkdirs(path):
-    try:
-        os.makedirs(DATA_HOME)
-    except OSError as exc:
-        if exc.errno != errno.EEXIST:
-            raise
-        pass
-
-
-must_mkdirs(DATA_HOME)
-
-
-def md5file(fname):
-    hash_md5 = hashlib.md5()
-    f = open(fname, "rb")
-    for chunk in iter(lambda: f.read(4096), b""):
-        hash_md5.update(chunk)
-    f.close()
-    return hash_md5.hexdigest()
-
-
-def download(url, module_name, md5sum, save_name=None):
-    dirname = os.path.join(DATA_HOME, module_name)
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-
-    filename = os.path.join(dirname,
-                            url.split('/')[-1]
-                            if save_name is None else save_name)
-
-    retry = 0
-    retry_limit = 3
-    while not (os.path.exists(filename) and md5file(filename) == md5sum):
-        if os.path.exists(filename):
-            print "file md5", md5file(filename), md5sum
-        if retry < retry_limit:
-            retry += 1
-        else:
-            raise RuntimeError("Cannot download {0} within retry limit {1}".
-                               format(url, retry_limit))
-        print "Cache file %s not found, downloading %s" % (filename, url)
-        r = requests.get(url, stream=True)
-        total_length = r.headers.get('content-length')
-
-        if total_length is None:
-            with open(filename, 'w') as f:
-                shutil.copyfileobj(r.raw, f)
-        else:
-            with open(filename, 'w') as f:
-                dl = 0
-                total_length = int(total_length)
-                for data in r.iter_content(chunk_size=4096):
-                    dl += len(data)
-                    f.write(data)
-                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
-                                                   ' ' * (50 - done)))
-                    sys.stdout.flush()
-
-    return filename
-
-
-def fetch_all():
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
-        if "fetch" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
-            getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
-                "fetch")()
-
-
-def fetch_all_recordio(path):
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
-        if "convert" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
-                not module_name == "common":
-            ds_path = os.path.join(path, module_name)
-            must_mkdirs(ds_path)
-            getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
-                "convert")(ds_path)
-
-
-def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
-    """
-    you can call the function as:
-
-    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
-        suffix="imikolov-train-%05d.pickle")
-
-    the output files as:
-
-    |-imikolov-train-00000.pickle
-    |-imikolov-train-00001.pickle
-    |- ...
-    |-imikolov-train-00480.pickle
-
-    :param reader: is a reader creator
-    :param line_count: line count for each file
-    :param suffix: the suffix for the output files, should contain "%d"
-                means the id for each file. Default is "%05d.pickle"
-    :param dumper: is a callable function that dump object to file, this
-                function will be called as dumper(obj, f) and obj is the object
-                will be dumped, f is a file object. Default is cPickle.dump.
-    """
-    if not callable(dumper):
-        raise TypeError("dumper should be callable.")
-    lines = []
-    indx_f = 0
-    for i, d in enumerate(reader()):
-        lines.append(d)
-        if i >= line_count and i % line_count == 0:
-            with open(suffix % indx_f, "w") as f:
-                dumper(lines, f)
-                lines = []
-                indx_f += 1
-    if lines:
-        with open(suffix % indx_f, "w") as f:
-            dumper(lines, f)
-
-
-def cluster_files_reader(files_pattern,
-                         trainer_count,
-                         trainer_id,
-                         loader=cPickle.load):
-    """
-    Create a reader that yield element from the given files, select
-    a file set according trainer count and trainer_id
-
-    :param files_pattern: the files which generating by split(...)
-    :param trainer_count: total trainer count
-    :param trainer_id: the trainer rank id
-    :param loader: is a callable function that load object from file, this
-                function will be called as loader(f) and f is a file object.
-                Default is cPickle.load
-    """
-
-    def reader():
-        if not callable(loader):
-            raise TypeError("loader should be callable.")
-        file_list = glob.glob(files_pattern)
-        file_list.sort()
-        my_file_list = []
-        for idx, fn in enumerate(file_list):
-            if idx % trainer_count == trainer_id:
-                print "append file: %s" % fn
-                my_file_list.append(fn)
-        for fn in my_file_list:
-            with open(fn, "r") as f:
-                lines = loader(f)
-                for line in lines:
-                    yield line
-
-    return reader
-
-
-def convert(output_path, reader, line_count, name_prefix):
-    import recordio
-    """
-    Convert data from reader to recordio format files.
-
-    :param output_path: directory in which output files will be saved.
-    :param reader: a data reader, from which the convert program will read
-                   data instances.
-    :param name_prefix: the name prefix of generated files.
-    :param max_lines_to_shuffle: the max lines numbers to shuffle before
-                                 writing.
-    """
-
-    assert line_count >= 1
-    indx_f = 0
-
-    def write_data(indx_f, lines):
-        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
-        writer = recordio.writer(filename)
-        for l in lines:
-            # FIXME(Yancey1989):
-            # dumps with protocol: pickle.HIGHEST_PROTOCOL
-            writer.write(cPickle.dumps(l))
-        writer.close()
-
-    lines = []
-    for i, d in enumerate(reader()):
-        lines.append(d)
-        if i % line_count == 0 and i >= line_count:
-            write_data(indx_f, lines)
-            lines = []
-            indx_f += 1
-            continue
-
-    write_data(indx_f, lines)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
deleted file mode 100644
index 8312900dc43..00000000000
--- a/python/paddle/v2/dataset/conll05.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Conll05 dataset.
-Paddle semantic role labeling Book and demo use this dataset as an example.
-Because Conll05 is not free in public, the default downloaded URL is test set
-of Conll05 (which is public). Users can change URL and MD5 to their Conll
-dataset. And a pre-trained word vector model based on Wikipedia corpus is used
-to initialize SRL model.
-"""
-
-import tarfile
-import gzip
-import itertools
-import paddle.v2.dataset.common
-
-__all__ = ['test, get_dict', 'get_embedding', 'convert']
-
-DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
-DATA_MD5 = '387719152ae52d60422c016e92a742fc'
-WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
-WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
-VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
-VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
-TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
-TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
-EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
-EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
-
-UNK_IDX = 0
-
-
-def load_label_dict(filename):
-    d = dict()
-    tag_dict = set()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if line.startswith("B-"):
-                tag_dict.add(line[2:])
-            elif line.startswith("I-"):
-                tag_dict.add(line[2:])
-        index = 0
-        for tag in tag_dict:
-            d["B-" + tag] = index
-            index += 1
-            d["I-" + tag] = index
-            index += 1
-        d["O"] = index
-    return d
-
-
-def load_dict(filename):
-    d = dict()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            d[line.strip()] = i
-    return d
-
-
-def corpus_reader(data_path, words_name, props_name):
-    """
-    Read one corpus. It returns an iterator. Each element of
-    this iterator is a tuple including sentence and labels. The sentence is
-    consist of a list of word IDs. The labels include a list of label IDs.
-    :return: a iterator of data.
-    :rtype: iterator
-    """
-
-    def reader():
-        tf = tarfile.open(data_path)
-        wf = tf.extractfile(words_name)
-        pf = tf.extractfile(props_name)
-        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
-                fileobj=pf) as props_file:
-            sentences = []
-            labels = []
-            one_seg = []
-            for word, label in itertools.izip(words_file, props_file):
-                word = word.strip()
-                label = label.strip().split()
-
-                if len(label) == 0:  # end of sentence
-                    for i in xrange(len(one_seg[0])):
-                        a_kind_lable = [x[i] for x in one_seg]
-                        labels.append(a_kind_lable)
-
-                    if len(labels) >= 1:
-                        verb_list = []
-                        for x in labels[0]:
-                            if x != '-':
-                                verb_list.append(x)
-
-                        for i, lbl in enumerate(labels[1:]):
-                            cur_tag = 'O'
-                            is_in_bracket = False
-                            lbl_seq = []
-                            verb_word = ''
-                            for l in lbl:
-                                if l == '*' and is_in_bracket == False:
-                                    lbl_seq.append('O')
-                                elif l == '*' and is_in_bracket == True:
-                                    lbl_seq.append('I-' + cur_tag)
-                                elif l == '*)':
-                                    lbl_seq.append('I-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') != -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') == -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = True
-                                else:
-                                    raise RuntimeError('Unexpected label: %s' %
-                                                       l)
-
-                            yield sentences, verb_list[i], lbl_seq
-
-                    sentences = []
-                    labels = []
-                    one_seg = []
-                else:
-                    sentences.append(word)
-                    one_seg.append(label)
-
-        pf.close()
-        wf.close()
-        tf.close()
-
-    return reader
-
-
-def reader_creator(corpus_reader,
-                   word_dict=None,
-                   predicate_dict=None,
-                   label_dict=None):
-    def reader():
-        for sentence, predicate, labels in corpus_reader():
-
-            sen_len = len(sentence)
-
-            verb_index = labels.index('B-V')
-            mark = [0] * len(labels)
-            if verb_index > 0:
-                mark[verb_index - 1] = 1
-                ctx_n1 = sentence[verb_index - 1]
-            else:
-                ctx_n1 = 'bos'
-
-            if verb_index > 1:
-                mark[verb_index - 2] = 1
-                ctx_n2 = sentence[verb_index - 2]
-            else:
-                ctx_n2 = 'bos'
-
-            mark[verb_index] = 1
-            ctx_0 = sentence[verb_index]
-
-            if verb_index < len(labels) - 1:
-                mark[verb_index + 1] = 1
-                ctx_p1 = sentence[verb_index + 1]
-            else:
-                ctx_p1 = 'eos'
-
-            if verb_index < len(labels) - 2:
-                mark[verb_index + 2] = 1
-                ctx_p2 = sentence[verb_index + 2]
-            else:
-                ctx_p2 = 'eos'
-
-            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
-
-            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            pred_idx = [predicate_dict.get(predicate)] * sen_len
-            label_idx = [label_dict.get(w) for w in labels]
-
-            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
-              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
-
-    return reader
-
-
-def get_dict():
-    """
-    Get the word, verb and label dictionary of Wikipedia corpus.
-    """
-    word_dict = load_dict(
-        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
-                                          WORDDICT_MD5))
-    verb_dict = load_dict(
-        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
-                                          VERBDICT_MD5))
-    label_dict = load_label_dict(
-        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
-                                          TRGDICT_MD5))
-    return word_dict, verb_dict, label_dict
-
-
-def get_embedding():
-    """
-    Get the trained word vector based on Wikipedia corpus.
-    """
-    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-
-
-def test():
-    """
-    Conll05 test set creator.
-
-    Because the training dataset is not free, the test dataset is used for
-    training. It returns a reader creator, each sample in the reader is nine
-    features, including sentence sequence, predicate, predicate context,
-    predicate context flag and tagged sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    word_dict, verb_dict, label_dict = get_dict()
-    reader = corpus_reader(
-        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
-        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
-        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
-    return reader_creator(reader, word_dict, verb_dict, label_dict)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
deleted file mode 100644
index db12076d540..00000000000
--- a/python/paddle/v2/dataset/flowers.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module will download dataset from
-http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
-and parse train/test set intopaddle reader creators.
-
-This set contains images of flowers belonging to 102 different categories.
-The images were acquired by searching the web and taking pictures. There are a
-minimum of 40 images for each category.
-
-The database was used in:
-
-Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
- number of classes.Proceedings of the Indian Conference on Computer Vision,
-Graphics and Image Processing (2008)
-http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
-
-"""
-import cPickle
-import itertools
-import functools
-from common import download
-import tarfile
-import scipy.io as scio
-from paddle.v2.image import *
-from paddle.v2.reader import *
-import os
-import numpy as np
-from multiprocessing import cpu_count
-__all__ = ['train', 'test', 'valid']
-
-DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
-LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
-SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
-LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
-SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
-# In official 'readme', tstid is the flag of test data
-# and trnid is the flag of train data. But test data is more than train data.
-# So we exchange the train data and test data.
-TRAIN_FLAG = 'tstid'
-TEST_FLAG = 'trnid'
-VALID_FLAG = 'valid'
-
-
-def default_mapper(is_train, sample):
-    '''
-    map image bytes data to type needed by model input layer
-    '''
-    img, label = sample
-    img = load_image_bytes(img)
-    img = simple_transform(
-        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
-    return img.flatten().astype('float32'), label
-
-
-train_mapper = functools.partial(default_mapper, True)
-test_mapper = functools.partial(default_mapper, False)
-
-
-def reader_creator(data_file,
-                   label_file,
-                   setid_file,
-                   dataset_name,
-                   mapper,
-                   buffered_size=1024,
-                   use_xmap=True,
-                   cycle=False):
-    '''
-    1. read images from tar file and
-        merge images into batch files in 102flowers.tgz_batch/
-    2. get a reader to read sample from batch file
-
-    :param data_file: downloaded data file
-    :type data_file: string
-    :param label_file: downloaded label file
-    :type label_file: string
-    :param setid_file: downloaded setid file containing information
-                        about how to split dataset
-    :type setid_file: string
-    :param dataset_name: data set name (tstid|trnid|valid)
-    :type dataset_name: string
-    :param mapper: a function to map image bytes data to type
-                    needed by model input layer
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: data reader
-    :rtype: callable
-    '''
-    labels = scio.loadmat(label_file)['labels'][0]
-    indexes = scio.loadmat(setid_file)[dataset_name][0]
-    img2label = {}
-    for i in indexes:
-        img = "jpg/image_%05d.jpg" % i
-        img2label[img] = labels[i - 1]
-    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
-
-    def reader():
-        while True:
-            for file in open(file_list):
-                file = file.strip()
-                batch = None
-                with open(file, 'r') as f:
-                    batch = cPickle.load(f)
-                data = batch['data']
-                labels = batch['label']
-                for sample, label in itertools.izip(data, batch['label']):
-                    yield sample, int(label) - 1
-            if not cycle:
-                break
-
-    if use_xmap:
-        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
-        return xmap_readers(mapper, reader, cpu_num, buffered_size)
-    else:
-        return map_readers(mapper, reader)
-
-
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers training set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: train data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TRAIN_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers test set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TEST_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
-    '''
-    Create flowers validation set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
-        buffered_size, use_xmap)
-
-
-def fetch():
-    download(DATA_URL, 'flowers', DATA_MD5)
-    download(LABEL_URL, 'flowers', LABEL_MD5)
-    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
deleted file mode 100644
index 00c2a3b9928..00000000000
--- a/python/paddle/v2/dataset/imdb.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-IMDB dataset.
-
-This module downloads IMDB dataset from
-http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
-of 25,000 highly polar movie reviews for training, and 25,000 for testing.
-Besides, this module also provides API for building dictionary.
-"""
-
-import paddle.v2.dataset.common
-import collections
-import tarfile
-import re
-import string
-
-__all__ = ['build_dict', 'train', 'test', 'convert']
-
-URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
-MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
-
-
-def tokenize(pattern):
-    """
-    Read files that match the given pattern.  Tokenize and yield each file.
-    """
-
-    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
-                                                        MD5)) as tarf:
-        # Note that we should use tarfile.next(), which does
-        # sequential access of member files, other than
-        # tarfile.extractfile, which does random access and might
-        # destroy hard disks.
-        tf = tarf.next()
-        while tf != None:
-            if bool(pattern.match(tf.name)):
-                # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
-                    None, string.punctuation).lower().split()
-            tf = tarf.next()
-
-
-def build_dict(pattern, cutoff):
-    """
-    Build a word dictionary from the corpus. Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    word_freq = collections.defaultdict(int)
-    for doc in tokenize(pattern):
-        for word in doc:
-            word_freq[word] += 1
-
-    # Not sure if we should prune less-frequent words here.
-    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
-
-    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*dictionary))
-    word_idx = dict(zip(words, xrange(len(words))))
-    word_idx['<unk>'] = len(words)
-    return word_idx
-
-
-def reader_creator(pos_pattern, neg_pattern, word_idx):
-    UNK = word_idx['<unk>']
-    INS = []
-
-    def load(pattern, out, label):
-        for doc in tokenize(pattern):
-            out.append(([word_idx.get(w, UNK) for w in doc], label))
-
-    load(pos_pattern, INS, 0)
-    load(neg_pattern, INS, 1)
-
-    def reader():
-        for doc, label in INS:
-            yield doc, label
-
-    return reader
-
-
-def train(word_idx):
-    """
-    IMDB training set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/train/pos/.*\.txt$"),
-        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
-
-
-def test(word_idx):
-    """
-    IMDB test set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/test/pos/.*\.txt$"),
-        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
-
-
-def word_dict(cutoff=150):
-    """
-    Build a word dictionary from the corpus.
-
-    :return: Word dictionary
-    :rtype: dict
-    """
-    return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    w = word_dict()
-    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
-    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
deleted file mode 100644
index 617c722c416..00000000000
--- a/python/paddle/v2/dataset/imikolov.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-imikolov's simple dataset.
-
-This module will download dataset from 
-http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
-into paddle reader creators.
-"""
-import paddle.v2.dataset.common
-import collections
-import tarfile
-
-__all__ = ['train', 'test', 'build_dict', 'convert']
-
-URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
-MD5 = '30177ea32e27c525793142b6bf2c8e2d'
-
-
-class DataType(object):
-    NGRAM = 1
-    SEQ = 2
-
-
-def word_count(f, word_freq=None):
-    if word_freq is None:
-        word_freq = collections.defaultdict(int)
-
-    for l in f:
-        for w in l.strip().split():
-            word_freq[w] += 1
-        word_freq['<s>'] += 1
-        word_freq['<e>'] += 1
-
-    return word_freq
-
-
-def build_dict(min_word_freq=50):
-    """
-    Build a word dictionary from the corpus,  Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    train_filename = './simple-examples/data/ptb.train.txt'
-    test_filename = './simple-examples/data/ptb.valid.txt'
-    with tarfile.open(
-            paddle.v2.dataset.common.download(
-                paddle.v2.dataset.imikolov.URL, 'imikolov',
-                paddle.v2.dataset.imikolov.MD5)) as tf:
-        trainf = tf.extractfile(train_filename)
-        testf = tf.extractfile(test_filename)
-        word_freq = word_count(testf, word_count(trainf))
-        if '<unk>' in word_freq:
-            # remove <unk> for now, since we will set it as last index
-            del word_freq['<unk>']
-
-        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
-
-        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(zip(words, xrange(len(words))))
-        word_idx['<unk>'] = len(words)
-
-    return word_idx
-
-
-def reader_creator(filename, word_idx, n, data_type):
-    def reader():
-        with tarfile.open(
-                paddle.v2.dataset.common.download(
-                    paddle.v2.dataset.imikolov.URL, 'imikolov',
-                    paddle.v2.dataset.imikolov.MD5)) as tf:
-            f = tf.extractfile(filename)
-
-            UNK = word_idx['<unk>']
-            for l in f:
-                if DataType.NGRAM == data_type:
-                    assert n > -1, 'Invalid gram length'
-                    l = ['<s>'] + l.strip().split() + ['<e>']
-                    if len(l) >= n:
-                        l = [word_idx.get(w, UNK) for w in l]
-                        for i in range(n, len(l) + 1):
-                            yield tuple(l[i - n:i])
-                elif DataType.SEQ == data_type:
-                    l = l.strip().split()
-                    l = [word_idx.get(w, UNK) for w in l]
-                    src_seq = [word_idx['<s>']] + l
-                    trg_seq = l + [word_idx['<e>']]
-                    if n > 0 and len(src_seq) > n: continue
-                    yield src_seq, trg_seq
-                else:
-                    assert False, 'Unknow data type'
-
-    return reader
-
-
-def train(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov training set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
-                          data_type)
-
-
-def test(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov test set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
-                          data_type)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    N = 5
-    word_dict = build_dict()
-    paddle.v2.dataset.common.convert(path,
-                                     train(word_dict, N), 1000,
-                                     "imikolov_train")
-    paddle.v2.dataset.common.convert(path,
-                                     test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
deleted file mode 100644
index 026cf501cfb..00000000000
--- a/python/paddle/v2/dataset/mnist.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MNIST dataset.
-
-This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse training set and test set into paddle reader creators.
-"""
-import paddle.v2.dataset.common
-import subprocess
-import numpy
-import platform
-__all__ = ['train', 'test', 'convert']
-
-URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
-TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
-TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
-TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
-TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
-TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
-TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
-TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
-TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
-
-
-def reader_creator(image_filename, label_filename, buffer_size):
-    def reader():
-        if platform.system() == 'Darwin':
-            zcat_cmd = 'gzcat'
-        elif platform.system() == 'Linux':
-            zcat_cmd = 'zcat'
-        else:
-            raise NotImplementedError()
-
-        # According to http://stackoverflow.com/a/38061619/724872, we
-        # cannot use standard package gzip here.
-        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
-        m.stdout.read(16)  # skip some magic bytes
-
-        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
-        l.stdout.read(8)  # skip some magic bytes
-
-        try:  # reader could be break.
-            while True:
-                labels = numpy.fromfile(
-                    l.stdout, 'ubyte', count=buffer_size).astype("int")
-
-                if labels.size != buffer_size:
-                    break  # numpy.fromfile returns empty slice after EOF.
-
-                images = numpy.fromfile(
-                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
-                        (buffer_size, 28 * 28)).astype('float32')
-
-                images = images / 255.0 * 2.0 - 1.0
-
-                for i in xrange(buffer_size):
-                    yield images[i, :], int(labels[i])
-        finally:
-            try:
-                m.terminate()
-            except:
-                pass
-            try:
-                l.terminate()
-            except:
-                pass
-
-    return reader
-
-
-def train():
-    """
-    MNIST training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
-                                          TRAIN_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
-                                          TRAIN_LABEL_MD5), 100)
-
-
-def test():
-    """
-    MNIST test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
-                                          TEST_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
-                                          TEST_LABEL_MD5), 100)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
-    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
deleted file mode 100644
index 5b61a9420af..00000000000
--- a/python/paddle/v2/dataset/movielens.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Movielens 1-M dataset.
-
-Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
-movies, which was collected by GroupLens Research. This module will download
-Movielens 1-M dataset from 
-http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
-set and test set into paddle reader creators.
-
-"""
-
-import zipfile
-import paddle.v2.dataset.common
-import re
-import random
-import functools
-
-__all__ = [
-    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
-    'convert'
-]
-
-age_table = [1, 18, 25, 35, 45, 50, 56]
-
-URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
-MD5 = 'c4d9eecfca2ab87c1945afe126590906'
-
-
-class MovieInfo(object):
-    """
-    Movie id, title and categories information are stored in MovieInfo.
-    """
-
-    def __init__(self, index, categories, title):
-        self.index = int(index)
-        self.categories = categories
-        self.title = title
-
-    def value(self):
-        """
-        Get information from a movie.
-        """
-        return [
-            self.index, [CATEGORIES_DICT[c] for c in self.categories],
-            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
-        ]
-
-    def __str__(self):
-        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
-            self.index, self.title, self.categories)
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class UserInfo(object):
-    """
-    User id, gender, age, and job information are stored in UserInfo.
-    """
-
-    def __init__(self, index, gender, age, job_id):
-        self.index = int(index)
-        self.is_male = gender == 'M'
-        self.age = age_table.index(int(age))
-        self.job_id = int(job_id)
-
-    def value(self):
-        """
-        Get information from a user.
-        """
-        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
-
-    def __str__(self):
-        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
-            self.index, "M"
-            if self.is_male else "F", age_table[self.age], self.job_id)
-
-    def __repr__(self):
-        return str(self)
-
-
-MOVIE_INFO = None
-MOVIE_TITLE_DICT = None
-CATEGORIES_DICT = None
-USER_INFO = None
-
-
-def __initialize_meta_info__():
-    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
-    global MOVIE_INFO
-    if MOVIE_INFO is None:
-        pattern = re.compile(r'^(.*)\((\d+)\)$')
-        with zipfile.ZipFile(file=fn) as package:
-            for info in package.infolist():
-                assert isinstance(info, zipfile.ZipInfo)
-                MOVIE_INFO = dict()
-                title_word_set = set()
-                categories_set = set()
-                with package.open('ml-1m/movies.dat') as movie_file:
-                    for i, line in enumerate(movie_file):
-                        movie_id, title, categories = line.strip().split('::')
-                        categories = categories.split('|')
-                        for c in categories:
-                            categories_set.add(c)
-                        title = pattern.match(title).group(1)
-                        MOVIE_INFO[int(movie_id)] = MovieInfo(
-                            index=movie_id, categories=categories, title=title)
-                        for w in title.split():
-                            title_word_set.add(w.lower())
-
-                global MOVIE_TITLE_DICT
-                MOVIE_TITLE_DICT = dict()
-                for i, w in enumerate(title_word_set):
-                    MOVIE_TITLE_DICT[w] = i
-
-                global CATEGORIES_DICT
-                CATEGORIES_DICT = dict()
-                for i, c in enumerate(categories_set):
-                    CATEGORIES_DICT[c] = i
-
-                global USER_INFO
-                USER_INFO = dict()
-                with package.open('ml-1m/users.dat') as user_file:
-                    for line in user_file:
-                        uid, gender, age, job, _ = line.strip().split("::")
-                        USER_INFO[int(uid)] = UserInfo(
-                            index=uid, gender=gender, age=age, job_id=job)
-    return fn
-
-
-def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
-    fn = __initialize_meta_info__()
-    rand = random.Random(x=rand_seed)
-    with zipfile.ZipFile(file=fn) as package:
-        with package.open('ml-1m/ratings.dat') as rating:
-            for line in rating:
-                if (rand.random() < test_ratio) == is_test:
-                    uid, mov_id, rating, _ = line.strip().split("::")
-                    uid = int(uid)
-                    mov_id = int(mov_id)
-                    rating = float(rating) * 2 - 5.0
-
-                    mov = MOVIE_INFO[mov_id]
-                    usr = USER_INFO[uid]
-                    yield usr.value() + mov.value() + [[rating]]
-
-
-def __reader_creator__(**kwargs):
-    return lambda: __reader__(**kwargs)
-
-
-train = functools.partial(__reader_creator__, is_test=False)
-test = functools.partial(__reader_creator__, is_test=True)
-
-
-def get_movie_title_dict():
-    """
-    Get movie title dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_TITLE_DICT
-
-
-def __max_index_info__(a, b):
-    if a.index > b.index:
-        return a
-    else:
-        return b
-
-
-def max_movie_id():
-    """
-    Get the maximum value of movie id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
-
-
-def max_user_id():
-    """
-    Get the maximum value of user id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
-
-
-def __max_job_id_impl__(a, b):
-    if a.job_id > b.job_id:
-        return a
-    else:
-        return b
-
-
-def max_job_id():
-    """
-    Get the maximum value of job id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
-
-
-def movie_categories():
-    """
-    Get movie categoriges dictionary.
-    """
-    __initialize_meta_info__()
-    return CATEGORIES_DICT
-
-
-def user_info():
-    """
-    Get user info dictionary.
-    """
-    __initialize_meta_info__()
-    return USER_INFO
-
-
-def movie_info():
-    """
-    Get movie info dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_INFO
-
-
-def unittest():
-    for train_count, _ in enumerate(train()()):
-        pass
-    for test_count, _ in enumerate(test()()):
-        pass
-
-    print train_count, test_count
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, "movielens", MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
-
-
-if __name__ == '__main__':
-    unittest()
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
deleted file mode 100644
index d3b3dd524c3..00000000000
--- a/python/paddle/v2/dataset/mq2007.py
+++ /dev/null
@@ -1,333 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MQ2007 dataset
-
-MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
-validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
-validation set and testing set.
-
-MQ2007 dataset from website
-http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
-
-"""
-
-import os
-import functools
-import rarfile
-from common import download
-import numpy as np
-
-# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
-URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
-MD5 = "7be1640ae95c6408dab0ae7207bdc706"
-
-
-def __initialize_meta_info__():
-    """
-  download and extract the MQ2007 dataset
-  """
-    fn = fetch()
-    rar = rarfile.RarFile(fn)
-    dirpath = os.path.dirname(fn)
-    rar.extractall(path=dirpath)
-    return dirpath
-
-
-class Query(object):
-    """
-  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
-
-  Parameters:
-  ----------
-  query_id : int
-    query_id in dataset, mapping from query to relevance documents
-  relevance_score : int 
-    relevance score of query and document pair
-  feature_vector : array, dense feature
-    feature in vector format
-  description : string
-    comment section in query doc pair data
-  """
-
-    def __init__(self,
-                 query_id=-1,
-                 relevance_score=-1,
-                 feature_vector=None,
-                 description=""):
-        self.query_id = query_id
-        self.relevance_score = relevance_score
-        if feature_vector is None:
-            self.feature_vector = []
-        else:
-            self.feature_vector = feature_vector
-        self.description = description
-
-    def __str__(self):
-        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
-                               " ".join(str(f) for f in self.feature_vector))
-        return string
-
-    # @classmethod
-    def _parse_(self, text):
-        """
-    parse line into Query
-    """
-        comment_position = text.find('#')
-        line = text[:comment_position].strip()
-        self.description = text[comment_position + 1:].strip()
-        parts = line.split()
-        if len(parts) != 48:
-            sys.stdout.write("expect 48 space split parts, get %d" %
-                             (len(parts)))
-            return None
-        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
-        self.relevance_score = int(parts[0])
-        self.query_id = int(parts[1].split(':')[1])
-        for p in parts[2:]:
-            pair = p.split(':')
-            self.feature_vector.append(float(pair[1]))
-        return self
-
-
-class QueryList(object):
-    """
-  group query into list, every item in list is a Query
-  """
-
-    def __init__(self, querylist=None):
-        self.query_id = -1
-        if querylist is None:
-            self.querylist = []
-        else:
-            self.querylist = querylist
-            for query in self.querylist:
-                if self.query_id == -1:
-                    self.query_id = query.query_id
-                else:
-                    if self.query_id != query.query_id:
-                        raise ValueError("query in list must be same query_id")
-
-    def __iter__(self):
-        for query in self.querylist:
-            yield query
-
-    def __len__(self):
-        return len(self.querylist)
-
-    def __getitem__(self, i):
-        return self.querylist[i]
-
-    def _correct_ranking_(self):
-        if self.querylist is None:
-            return
-        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
-
-    def _add_query(self, query):
-        if self.query_id == -1:
-            self.query_id = query.query_id
-        else:
-            if self.query_id != query.query_id:
-                raise ValueError("query in list must be same query_id")
-        self.querylist.append(query)
-
-
-def gen_plain_txt(querylist):
-    """
-  gen plain text in list for other usage
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  query_id : np.array, shape=(samples_num, )
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-    """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield querylist.query_id, query.relevance_score, np.array(
-            query.feature_vector)
-
-
-def gen_point(querylist):
-    """
-  gen item in list for point-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield query.relevance_score, np.array(query.feature_vector)
-
-
-def gen_pair(querylist, partial_order="full"):
-    """
-  gen pair for pair-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-  pairtial_order : "full" or "neighbour"
-    there is redudant in all possiable pair combinations, which can be simplifed
-  gen pairs for neighbour items or the full partial order pairs
-
-  return :
-  ------
-  label : np.array, shape=(1)
-  query_left : np.array, shape=(1, feature_dimension)
-  query_right : same as left
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    labels = []
-    docpairs = []
-
-    # C(n,2)
-    for i in range(len(querylist)):
-        query_left = querylist[i]
-        for j in range(i + 1, len(querylist)):
-            query_right = querylist[j]
-            if query_left.relevance_score > query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_left.feature_vector),
-                    np.array(query_right.feature_vector)
-                ])
-            elif query_left.relevance_score < query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_right.feature_vector),
-                    np.array(query_left.feature_vector)
-                ])
-    for label, pair in zip(labels, docpairs):
-        yield np.array(label), pair[0], pair[1]
-
-
-def gen_list(querylist):
-    """
-  gen item in list for list-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    relevance_score_list = [[query.relevance_score] for query in querylist]
-    feature_vector_list = [query.feature_vector for query in querylist]
-    yield np.array(relevance_score_list), np.array(feature_vector_list)
-
-
-def query_filter(querylists):
-    """
-    filter query get only document with label 0.
-    label 0, 1, 2 means the relevance score document with query
-    parameters :
-      querylist : QueyList list
-
-    return :
-      querylist : QueyList list
-    """
-    filter_query = []
-    for querylist in querylists:
-        relevance_score_list = [query.relevance_score for query in querylist]
-        if sum(relevance_score_list) != .0:
-            filter_query.append(querylist)
-    return filter_query
-
-
-def load_from_text(filepath, shuffle=False, fill_missing=-1):
-    """
-  parse data file into querys
-  """
-    prev_query_id = -1
-    querylists = []
-    querylist = None
-    fn = __initialize_meta_info__()
-    with open(os.path.join(fn, filepath)) as f:
-        for line in f:
-            query = Query()
-            query = query._parse_(line)
-            if query == None:
-                continue
-            if query.query_id != prev_query_id:
-                if querylist is not None:
-                    querylists.append(querylist)
-                querylist = QueryList()
-                prev_query_id = query.query_id
-            querylist._add_query(query)
-    if querylist is not None:
-        querylists.append(querylist)
-    return querylists
-
-
-def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
-    """
-  Parameters
-  --------
-  filename : string
-  fill_missing : fill the missing value. default in MQ2007 is -1
-  
-  Returns
-  ------
-  yield
-    label query_left, query_right  # format = "pairwise"
-    label querylist # format = "listwise"
-  """
-    querylists = query_filter(
-        load_from_text(
-            filepath, shuffle=shuffle, fill_missing=fill_missing))
-    for querylist in querylists:
-        if format == "plain_txt":
-            yield next(gen_plain_txt(querylist))
-        elif format == "pointwise":
-            yield next(gen_point(querylist))
-        elif format == "pairwise":
-            for pair in gen_pair(querylist):
-                yield pair
-        elif format == "listwise":
-            yield next(gen_list(querylist))
-
-
-train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
-test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
-
-
-def fetch():
-    return download(URL, "MQ2007", MD5)
-
-
-if __name__ == "__main__":
-    fetch()
-    mytest = functools.partial(
-        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
-    for label, query in mytest():
-        print label, query
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
deleted file mode 100644
index b0b9757c1a7..00000000000
--- a/python/paddle/v2/dataset/sentiment.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-The script fetch and preprocess movie_reviews data set that provided by NLTK
-
-TODO(yuyang18): Complete dataset.
-"""
-
-import collections
-from itertools import chain
-
-import nltk
-from nltk.corpus import movie_reviews
-
-import paddle.v2.dataset.common
-
-__all__ = ['train', 'test', 'get_word_dict', 'convert']
-NUM_TRAINING_INSTANCES = 1600
-NUM_TOTAL_INSTANCES = 2000
-
-
-def download_data_if_not_yet():
-    """
-    Download the data set, if the data set is not download.
-    """
-    try:
-        # make sure that nltk can find the data
-        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
-        movie_reviews.categories()
-    except LookupError:
-        print "Downloading movie_reviews data set, please wait....."
-        nltk.download(
-            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
-        print "Download data set success....."
-        print "Path is " + nltk.data.find('corpora/movie_reviews').path
-
-
-def get_word_dict():
-    """
-    Sorted the words by the frequency of words which occur in sample
-    :return:
-        words_freq_sorted
-    """
-    words_freq_sorted = list()
-    word_freq_dict = collections.defaultdict(int)
-    download_data_if_not_yet()
-
-    for category in movie_reviews.categories():
-        for field in movie_reviews.fileids(category):
-            for words in movie_reviews.words(field):
-                word_freq_dict[words] += 1
-    words_sort_list = word_freq_dict.items()
-    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
-    for index, word in enumerate(words_sort_list):
-        words_freq_sorted.append((word[0], index))
-    return words_freq_sorted
-
-
-def sort_files():
-    """
-    Sorted the sample for cross reading the sample
-    :return:
-        files_list
-    """
-    files_list = list()
-    neg_file_list = movie_reviews.fileids('neg')
-    pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
-    return files_list
-
-
-def load_sentiment_data():
-    """
-    Load the data set
-    :return:
-        data_set
-    """
-    data_set = list()
-    download_data_if_not_yet()
-    words_ids = dict(get_word_dict())
-    for sample_file in sort_files():
-        words_list = list()
-        category = 0 if 'neg' in sample_file else 1
-        for word in movie_reviews.words(sample_file):
-            words_list.append(words_ids[word.lower()])
-        data_set.append((words_list, category))
-    return data_set
-
-
-def reader_creator(data):
-    """
-    Reader creator, generate an iterator for data set
-    :param data:
-        train data set or test data set
-    """
-    for each in data:
-        yield each[0], each[1]
-
-
-def train():
-    """
-    Default training set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
-
-
-def test():
-    """
-    Default test set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
-
-
-def fetch():
-    nltk.download(
-        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
-    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
deleted file mode 100644
index e0e18229da7..00000000000
--- a/python/paddle/v2/dataset/tests/cifar_test.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.cifar
-import unittest
-
-
-class TestCIFAR(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3072)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_test10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test10())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_train10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train10())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test100())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 99)
-
-    def test_train100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train100())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 99)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
deleted file mode 100644
index cfa194eba38..00000000000
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.common
-import unittest
-import tempfile
-import glob
-
-
-class TestCommon(unittest.TestCase):
-    def test_md5file(self):
-        _, temp_path = tempfile.mkstemp()
-        with open(temp_path, 'w') as f:
-            f.write("Hello\n")
-        self.assertEqual('09f7e02f1290be211da707a266f153b3',
-                         paddle.v2.dataset.common.md5file(temp_path))
-
-    def test_download(self):
-        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
-        self.assertEqual(
-            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
-            paddle.v2.dataset.common.download(
-                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
-
-    def test_split(self):
-        def test_reader():
-            def reader():
-                for x in xrange(10):
-                    yield x
-
-            return reader
-
-        _, temp_path = tempfile.mkstemp()
-        paddle.v2.dataset.common.split(
-            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
-        files = glob.glob(temp_path + '/test-%05d.pickle')
-        self.assertEqual(len(files), 3)
-
-    def test_cluster_file_reader(self):
-        _, temp_path = tempfile.mkstemp()
-        for x in xrange(5):
-            with open(temp_path + '/%05d.test' % x) as f:
-                f.write('%d\n' % x)
-        reader = paddle.v2.dataset.common.cluster_files_reader(
-            temp_path + '/*.test', 5, 0)
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, str("0"))
-
-    def test_convert(self):
-        record_num = 10
-        num_shards = 4
-
-        def test_reader():
-            def reader():
-                for x in xrange(record_num):
-                    yield x
-
-            return reader
-
-        path = tempfile.mkdtemp()
-        paddle.v2.dataset.common.convert(path,
-                                         test_reader(), num_shards,
-                                         'random_images')
-
-        files = glob.glob(path + '/random_images-*')
-        self.assertEqual(len(files), num_shards)
-
-        recs = []
-        for i in range(0, num_shards):
-            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
-            r = recordio.reader(n)
-            while True:
-                d = r.read()
-                if d is None:
-                    break
-                recs.append(d)
-
-        recs.sort()
-        self.assertEqual(total, record_num)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
deleted file mode 100644
index a8ae9a07acc..00000000000
--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.flowers
-import unittest
-
-
-class TestFlowers(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        size = 224 * 224 * 3
-        for l in reader():
-            self.assertEqual(l[0].size, size)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.train())
-        self.assertEqual(instances, 6149)
-        self.assertEqual(max_label_value, 102)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.test())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-    def test_valid(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.valid())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
deleted file mode 100644
index c4d82f26895..00000000000
--- a/python/paddle/v2/dataset/tests/imdb_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.imdb
-import unittest
-import re
-
-TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
-TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
-TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
-
-TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
-TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
-TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
-
-
-class TestIMDB(unittest.TestCase):
-    word_idx = None
-
-    def test_build_dict(self):
-        if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
-
-        self.assertEqual(len(self.word_idx), 7036)
-
-    def check_dataset(self, dataset, expected_size):
-        if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
-
-        sum = 0
-        for l in dataset(self.word_idx):
-            self.assertEqual(l[1], sum % 2)
-            sum += 1
-        self.assertEqual(sum, expected_size)
-
-    def test_train(self):
-        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
-
-    def test_test(self):
-        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
deleted file mode 100644
index 714a75d6f1f..00000000000
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.imikolov
-import unittest
-
-WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
-
-
-class TestMikolov(unittest.TestCase):
-    def check_reader(self, reader, n):
-        for l in reader():
-            self.assertEqual(len(l), n)
-
-    def test_train(self):
-        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
-
-        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
-            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
-            'rake regatta rubens sim snack-food ssangyong swapo wachter'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.v2.dataset.imikolov.train(
-                WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_test(self):
-        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
-
-        first_line = 'consumers may want to move their telephones a little '\
-                'closer to the tv set'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.v2.dataset.imikolov.test(
-                WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_total(self):
-        _, idx = zip(*WORD_DICT.items())
-        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
deleted file mode 100644
index 1d344cac3e7..00000000000
--- a/python/paddle/v2/dataset/tests/mnist_test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.mnist
-import unittest
-
-
-class TestMNIST(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 784)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.train())
-        self.assertEqual(instances, 60000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.test())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
deleted file mode 100644
index 59847b6c18e..00000000000
--- a/python/paddle/v2/dataset/tests/mq2007_test.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.mq2007
-import unittest
-
-
-class TestMQ2007(unittest.TestCase):
-    def test_pairwise(self):
-        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
-                format="pairwise"):
-            self.assertEqual(query_left.shape(), (46, ))
-            self.assertEqual(query_right.shape(), (46, ))
-
-    def test_listwise(self):
-        for label_array, query_array in paddle.v2.dataset.mq2007.test(
-                format="listwise"):
-            self.assertEqual(len(label_array), len(query_array))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
deleted file mode 100644
index 40740529073..00000000000
--- a/python/paddle/v2/dataset/tests/test_sentiment.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import nltk
-import paddle.v2.dataset.sentiment as st
-from nltk.corpus import movie_reviews
-
-
-class TestSentimentMethods(unittest.TestCase):
-    def test_get_word_dict(self):
-        word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
-                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
-                          (u'is', 8), (u'in', 9)]
-        for idx, each in enumerate(word_dict):
-            self.assertEqual(each, test_word_list[idx])
-        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
-
-    def test_sort_files(self):
-        last_label = ''
-        for sample_file in st.sort_files():
-            current_label = sample_file.split("/")[0]
-            self.assertNotEqual(current_label, last_label)
-            last_label = current_label
-
-    def test_data_set(self):
-        data_set = st.load_sentiment_data()
-        last_label = -1
-        for each in st.test():
-            self.assertNotEqual(each[1], last_label)
-            last_label = each[1]
-        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
-        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
-        self.assertEqual(
-            len(list(st.test())),
-            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
deleted file mode 100644
index 31e72ebf5ea..00000000000
--- a/python/paddle/v2/dataset/tests/voc2012_test.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.voc2012
-import unittest
-
-
-class TestVOC(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3 * l[1].size)
-            sum += 1
-        return sum
-
-    def test_train(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.train())
-        self.assertEqual(count, 2913)
-
-    def test_test(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.test())
-        self.assertEqual(count, 1464)
-
-    def test_val(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.val())
-        self.assertEqual(count, 1449)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
deleted file mode 100644
index cef6c3216e7..00000000000
--- a/python/paddle/v2/dataset/tests/wmt16_test.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.wmt16
-import unittest
-
-
-class TestWMT16(unittest.TestCase):
-    def checkout_one_sample(self, sample):
-        # train data has 3 field: source language word indices,
-        # target language word indices, and target next word indices.
-        self.assertEqual(len(sample), 3)
-
-        # test start mark and end mark in source word indices.
-        self.assertEqual(sample[0][0], 0)
-        self.assertEqual(sample[0][-1], 1)
-
-        # test start mask in target word indices
-        self.assertEqual(sample[1][0], 0)
-
-        # test en mask in target next word indices
-        self.assertEqual(sample[2][-1], 1)
-
-    def test_train(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.train(
-                    src_dict_size=100000, trg_dict_size=100000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_test(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.test(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_val(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.validation(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_get_dict(self):
-        dict_size = 1000
-        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
-        self.assertEqual(len(word_dict), dict_size)
-        self.assertEqual(word_dict[0], "<s>")
-        self.assertEqual(word_dict[1], "<e>")
-        self.assertEqual(word_dict[2], "<unk>")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
deleted file mode 100644
index f10bf7e42a1..00000000000
--- a/python/paddle/v2/dataset/uci_housing.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-UCI Housing dataset.
-
-This module will download dataset from
-https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
-parse training set and test set into paddle reader creators.
-"""
-
-import numpy as np
-import os
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
-
-__all__ = ['train', 'test']
-
-URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
-MD5 = 'd4accdce7a25600298819f8e28e8d593'
-feature_names = [
-    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT', 'convert'
-]
-
-UCI_TRAIN_DATA = None
-UCI_TEST_DATA = None
-URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
-MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
-
-
-def feature_range(maximums, minimums):
-    import matplotlib
-    matplotlib.use('Agg')
-    import matplotlib.pyplot as plt
-    fig, ax = plt.subplots()
-    feature_num = len(maximums)
-    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
-    ax.set_title('feature scale')
-    plt.xticks(range(feature_num), feature_names)
-    plt.xlim([-1, feature_num])
-    fig.set_figheight(6)
-    fig.set_figwidth(10)
-    if not os.path.exists('./image'):
-        os.makedirs('./image')
-    fig.savefig('image/ranges.png', dpi=48)
-    plt.close(fig)
-
-
-def load_data(filename, feature_num=14, ratio=0.8):
-    global UCI_TRAIN_DATA, UCI_TEST_DATA
-    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
-        return
-
-    data = np.fromfile(filename, sep=' ')
-    data = data.reshape(data.shape[0] / feature_num, feature_num)
-    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
-        axis=0) / data.shape[0]
-    feature_range(maximums[:-1], minimums[:-1])
-    for i in xrange(feature_num - 1):
-        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
-    offset = int(data.shape[0] * ratio)
-    UCI_TRAIN_DATA = data[:offset]
-    UCI_TEST_DATA = data[offset:]
-
-
-def train():
-    """
-    UCI_HOUSING training set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    global UCI_TRAIN_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TRAIN_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def test():
-    """
-    UCI_HOUSING test set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    global UCI_TEST_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TEST_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
-                                                 MD5_MODEL)
-    with open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
deleted file mode 100644
index 617e212d67f..00000000000
--- a/python/paddle/v2/dataset/voc2012.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Image dataset for segmentation.
-The 2012 dataset contains images from 2008-2011 for which additional
-segmentations have been prepared. As in previous years the assignment
-to training/test sets has been maintained. The total number of images
-with segmentation has been increased from 7,062 to 9,993.
-"""
-
-import tarfile
-import io
-import numpy as np
-from paddle.v2.dataset.common import download
-from paddle.v2.image import *
-from PIL import Image
-
-__all__ = ['train', 'test', 'val']
-
-VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
-VOCtrainval_11-May-2012.tar'
-
-VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
-SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
-DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
-LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
-
-CACHE_DIR = 'voc2012'
-
-
-def reader_creator(filename, sub_name):
-
-    tarobject = tarfile.open(filename)
-    name2mem = {}
-    for ele in tarobject.getmembers():
-        name2mem[ele.name] = ele
-
-    def reader():
-        set_file = SET_FILE.format(sub_name)
-        sets = tarobject.extractfile(name2mem[set_file])
-        for line in sets:
-            line = line.strip()
-            data_file = DATA_FILE.format(line)
-            label_file = LABEL_FILE.format(line)
-            data = tarobject.extractfile(name2mem[data_file]).read()
-            label = tarobject.extractfile(name2mem[label_file]).read()
-            data = Image.open(io.BytesIO(data))
-            label = Image.open(io.BytesIO(label))
-            data = np.array(data)
-            label = np.array(label)
-            yield data, label
-
-    return reader
-
-
-def train():
-    """
-    Create a train dataset reader containing 2913 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
-
-
-def test():
-    """
-    Create a test dataset reader containing 1464 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
-
-
-def val():
-    """
-    Create a val dataset reader containing 1449 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
deleted file mode 100644
index b9e602f324a..00000000000
--- a/python/paddle/v2/dataset/wmt14.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-WMT14 dataset.
-The original WMT14 dataset is too large and a small set of data for set is
-provided. This module will download dataset from
-http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz and
-parse training set and test set into paddle reader creators.
-
-"""
-import tarfile
-import gzip
-
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
-
-__all__ = [
-    'train',
-    'test',
-    'get_dict',
-    'convert',
-]
-
-URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
-                'cslm_joint_paper/data/dev+test.tgz')
-MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-# this is a small set of data for test. The original data is too large and
-# will be add later.
-URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
-MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# BLEU of this trained model is 26.92
-URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
-MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
-
-START = "<s>"
-END = "<e>"
-UNK = "<unk>"
-UNK_IDX = 2
-
-
-def __read_to_dict(tar_file, dict_size):
-    def __to_dict(fd, size):
-        out_dict = dict()
-        for line_count, line in enumerate(fd):
-            if line_count < size:
-                out_dict[line.strip()] = line_count
-            else:
-                break
-        return out_dict
-
-    with tarfile.open(tar_file, mode='r') as f:
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("src.dict")
-        ]
-        assert len(names) == 1
-        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("trg.dict")
-        ]
-        assert len(names) == 1
-        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        return src_dict, trg_dict
-
-
-def reader_creator(tar_file, file_name, dict_size):
-    def reader():
-        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-        with tarfile.open(tar_file, mode='r') as f:
-            names = [
-                each_item.name for each_item in f
-                if each_item.name.endswith(file_name)
-            ]
-            for name in names:
-                for line in f.extractfile(name):
-                    line_split = line.strip().split('\t')
-                    if len(line_split) != 2:
-                        continue
-                    src_seq = line_split[0]  # one source sequence
-                    src_words = src_seq.split()
-                    src_ids = [
-                        src_dict.get(w, UNK_IDX)
-                        for w in [START] + src_words + [END]
-                    ]
-
-                    trg_seq = line_split[1]  # one target sequence
-                    trg_words = trg_seq.split()
-                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                    # remove sequence whose length > 80 in training mode
-                    if len(src_ids) > 80 or len(trg_ids) > 80:
-                        continue
-                    trg_ids_next = trg_ids + [trg_dict[END]]
-                    trg_ids = [trg_dict[START]] + trg_ids
-
-                    yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(dict_size):
-    """
-    WMT14 training set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'train/train', dict_size)
-
-
-def test(dict_size):
-    """
-    WMT14 test set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'test/test', dict_size)
-
-
-def gen(dict_size):
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'gen/gen', dict_size)
-
-
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-    with gzip.open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
-def get_dict(dict_size, reverse=True):
-    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
-    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-    if reverse:
-        src_dict = {v: k for k, v in src_dict.items()}
-        trg_dict = {v: k for k, v in trg_dict.items()}
-    return src_dict, trg_dict
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    dict_size = 30000
-    paddle.v2.dataset.common.convert(path,
-                                     train(dict_size), 1000, "wmt14_train")
-    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
deleted file mode 100644
index 5793002091b..00000000000
--- a/python/paddle/v2/dataset/wmt16.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-ACL2016 Multimodal Machine Translation. Please see this website for more
-details: http://www.statmt.org/wmt16/multimodal-task.html#task1
-
-If you use the dataset created for your task, please cite the following paper:
-Multi30K: Multilingual English-German Image Descriptions.
-
-@article{elliott-EtAl:2016:VL16,
- author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
- title     = {Multi30K: Multilingual English-German Image Descriptions},
- booktitle = {Proceedings of the 6th Workshop on Vision and Language},
- year      = {2016},
- pages     = {70--74},
- year      = 2016
-}
-"""
-
-import os
-import tarfile
-import gzip
-from collections import defaultdict
-
-import paddle.v2.dataset.common
-
-__all__ = [
-    "train",
-    "test",
-    "validation",
-    "convert",
-    "fetch",
-    "get_dict",
-]
-
-DATA_URL = ("http://cloud.dlnel.org/filepub/"
-            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
-DATA_MD5 = "0c38be43600334966403524a40dcd81e"
-
-TOTAL_EN_WORDS = 11250
-TOTAL_DE_WORDS = 19220
-
-START_MARK = "<s>"
-END_MARK = "<e>"
-UNK_MARK = "<unk>"
-
-
-def __build_dict(tar_file, dict_size, save_path, lang):
-    word_dict = defaultdict(int)
-    with tarfile.open(tar_file, mode="r") as f:
-        for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split("\t")
-            if len(line_split) != 2: continue
-            sen = line_split[0] if lang == "en" else line_split[1]
-            for w in sen.split():
-                word_dict[w] += 1
-
-    with open(save_path, "w") as fout:
-        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
-        for idx, word in enumerate(
-                sorted(
-                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
-            if idx + 3 == dict_size: break
-            fout.write(word[0].encode('utf-8'))
-            fout.write('\n')
-
-
-def __load_dict(tar_file, dict_size, lang, reverse=False):
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    if not os.path.exists(dict_path) or (
-            len(open(dict_path, "r").readlines()) != dict_size):
-        __build_dict(tar_file, dict_size, dict_path, lang)
-
-    word_dict = {}
-    with open(dict_path, "r") as fdict:
-        for idx, line in enumerate(fdict):
-            if reverse:
-                word_dict[idx] = line.strip()
-            else:
-                word_dict[line.strip()] = idx
-    return word_dict
-
-
-def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
-    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
-                                        TOTAL_DE_WORDS))
-    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
-                                        TOTAL_ENG_WORDS))
-    return src_dict_size, trg_dict_size
-
-
-def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
-    def reader():
-        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
-        trg_dict = __load_dict(tar_file, trg_dict_size,
-                               ("de" if src_lang == "en" else "en"))
-
-        # the indice for start mark, end mark, and unk are the same in source
-        # language and target language. Here uses the source language
-        # dictionary to determine their indices.
-        start_id = src_dict[START_MARK]
-        end_id = src_dict[END_MARK]
-        unk_id = src_dict[UNK_MARK]
-
-        src_col = 0 if src_lang == "en" else 1
-        trg_col = 1 - src_col
-
-        with tarfile.open(tar_file, mode="r") as f:
-            for line in f.extractfile(file_name):
-                line_split = line.strip().split("\t")
-                if len(line_split) != 2:
-                    continue
-                src_words = line_split[src_col].split()
-                src_ids = [start_id] + [
-                    src_dict.get(w, unk_id) for w in src_words
-                ] + [end_id]
-
-                trg_words = line_split[trg_col].split()
-                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
-
-                trg_ids_next = trg_ids + [end_id]
-                trg_ids = [start_id] + trg_ids
-
-                yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 train set reader.
-
-    This function returns the reader for train data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-
-    NOTE:
-    The original like for training data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The train reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type.  Only support: "
-                         "en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/train",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def test(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 test set reader.
-
-    This function returns the reader for test data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for test data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The test reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/test",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def validation(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 validation set reader.
-
-    This function returns the reader for validation data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for validation data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The validation reader.
-    """
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/val",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def get_dict(lang, dict_size, reverse=False):
-    """
-    return the word dictionary for the specified language.
-
-    Args:
-        lang(string): A string indicating which language is the source
-                      language. Available options are: "en" for English
-                      and "de" for Germany.
-        dict_size(int): Size of the specified language dictionary.
-        reverse(bool): If reverse is set to False, the returned python
-                       dictionary will use word as key and use index as value.
-                       If reverse is set to True, the returned python
-                       dictionary will use index as key and word as value.
-
-    Returns:
-        dict: The word dictionary for the specific language.
-    """
-
-    if lang == "en":
-        dict_size = min(dict_size, TOTAL_EN_WORDS)
-    else:
-        dict_size = min(dict_size, TOTAL_DE_WORDS)
-
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    assert os.path.exists(dict_path), "Word dictionary does not exist. "
-    "Please invoke paddle.dataset.wmt16.train/test/validation first "
-    "to build the dictionary."
-    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
-    return __load_dict(tar_file, dict_size, lang, reverse)
-
-
-def fetch():
-    """download the entire dataset.
-    """
-    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                      "wmt16.tar.gz")
-
-
-def convert(path, src_dict_size, trg_dict_size, src_lang):
-    """Converts dataset to recordio format.
-    """
-
-    paddle.v2.dataset.common.convert(
-        path,
-        train(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_train")
-    paddle.v2.dataset.common.convert(
-        path,
-        test(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_test")
-    paddle.v2.dataset.common.convert(
-        path,
-        validation(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_validation")
diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py
deleted file mode 100644
index eaaadbe53bc..00000000000
--- a/python/paddle/v2/evaluator.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.evaluators as evs
-from config_base import __convert_to_v2__
-import inspect
-
-__all__ = []
-
-
-def initialize():
-    def convert_to_new_name(nm):
-        return nm[:-len("_evaluator")]
-
-    for __ev_name__ in filter(lambda x: x.endswith('_evaluator'), evs.__all__):
-        __ev__ = getattr(evs, __ev_name__)
-        __new_name__ = convert_to_new_name(__ev_name__)
-
-        globals()[__new_name__] = __convert_to_v2__(__ev__, __new_name__,
-                                                    __name__)
-        globals()[__new_name__].__name__ = __new_name__
-        __all__.append(__new_name__)
-
-
-initialize()
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
deleted file mode 100644
index c11aa121c19..00000000000
--- a/python/paddle/v2/event.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Testing and training events.
-
-There are:
-
-* TestResult
-* BeginIteration
-* EndIteration
-* BeginPass
-* EndPass
-"""
-__all__ = [
-    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult',
-    'EndForwardBackward'
-]
-
-
-class WithMetric(object):
-    def __init__(self, evaluator):
-        import py_paddle.swig_paddle as api
-        if not isinstance(evaluator, api.Evaluator):
-            raise TypeError("Evaluator should be api.Evaluator type")
-        self.__evaluator__ = evaluator
-
-    @property
-    def metrics(self):
-        names = self.__evaluator__.getNames()
-        retv = dict()
-        for each_name in names:
-            val = self.__evaluator__.getValue(each_name)
-            retv[each_name] = val
-        return retv
-
-
-class TestResult(WithMetric):
-    """
-    Result that trainer.test return.
-    """
-
-    def __init__(self, evaluator, cost):
-        super(TestResult, self).__init__(evaluator)
-        self.cost = cost
-
-
-class BeginPass(object):
-    """
-    Event On One Pass Training Start.
-    """
-
-    def __init__(self, pass_id):
-        self.pass_id = pass_id
-
-
-class EndPass(WithMetric):
-    """
-    Event On One Pass Training Complete.
-    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
-    in your event_handler call back
-    """
-
-    def __init__(self, pass_id, evaluator, gm):
-        self.pass_id = pass_id
-        self.gm = gm
-        WithMetric.__init__(self, evaluator)
-
-
-class BeginIteration(object):
-    """
-    Event On One Batch Training Start.
-    """
-
-    def __init__(self, pass_id, batch_id):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-
-
-class EndForwardBackward(object):
-    """
-    Event On One Batch ForwardBackward Complete.
-    """
-
-    def __init__(self, pass_id, batch_id, gm):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-        self.gm = gm
-
-
-class EndIteration(WithMetric):
-    """
-    Event On One Batch Training Complete.
-    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
-    in your event_handler call back
-    """
-
-    def __init__(self, pass_id, batch_id, cost, evaluator, gm):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-        self.cost = cost
-        self.gm = gm
-        WithMetric.__init__(self, evaluator)
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
deleted file mode 100644
index 08d8bd68f9b..00000000000
--- a/python/paddle/v2/image.py
+++ /dev/null
@@ -1,380 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This file contains some common interfaces for image preprocess.
-Many users are confused about the image layout. We introduce
-the image layout as follows.
-
-- CHW Layout
-
-  - The abbreviations: C=channel, H=Height, W=Width
-  - The default layout of image opened by cv2 or PIL is HWC.
-    PaddlePaddle only supports the CHW layout. And CHW is simply
-    a transpose of HWC. It must transpose the input image.
-
-- Color format: RGB or BGR
-
-  OpenCV use BGR color format. PIL use RGB color format. Both
-  formats can be used for training. Noted that, the format should
-  be keep consistent between the training and inference peroid.
-"""
-import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
-import os
-import tarfile
-import cPickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
-
-
-def batch_images_from_tar(data_file,
-                          dataset_name,
-                          img2label,
-                          num_per_batch=1024):
-    """
-    Read images from tar file and batch them into batch file.
-
-    :param data_file: path of image tar file
-    :type data_file: string
-    :param dataset_name: 'train','test' or 'valid'
-    :type dataset_name: string
-    :param img2label: a dic with image file name as key 
-                    and image's label as value
-    :type img2label: dic
-    :param num_per_batch: image number per batch file
-    :type num_per_batch: int
-    :return: path of list file containing paths of batch file
-    :rtype: string
-    """
-    batch_dir = data_file + "_batch"
-    out_path = "%s/%s" % (batch_dir, dataset_name)
-    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
-
-    if os.path.exists(out_path):
-        return meta_file
-    else:
-        os.makedirs(out_path)
-
-    tf = tarfile.open(data_file)
-    mems = tf.getmembers()
-    data = []
-    labels = []
-    file_id = 0
-    for mem in mems:
-        if mem.name in img2label:
-            data.append(tf.extractfile(mem).read())
-            labels.append(img2label[mem.name])
-            if len(data) == num_per_batch:
-                output = {}
-                output['label'] = labels
-                output['data'] = data
-                cPickle.dump(
-                    output,
-                    open('%s/batch_%d' % (out_path, file_id), 'w'),
-                    protocol=cPickle.HIGHEST_PROTOCOL)
-                file_id += 1
-                data = []
-                labels = []
-    if len(data) > 0:
-        output = {}
-        output['label'] = labels
-        output['data'] = data
-        cPickle.dump(
-            output,
-            open('%s/batch_%d' % (out_path, file_id), 'w'),
-            protocol=cPickle.HIGHEST_PROTOCOL)
-
-    with open(meta_file, 'a') as meta:
-        for file in os.listdir(out_path):
-            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
-    return meta_file
-
-
-def load_image_bytes(bytes, is_color=True):
-    """
-    Load an color or gray image from bytes array.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        with open('cat.jpg') as f:
-            im = load_image_bytes(f.read())
-
-    :param bytes: the input image bytes array.
-    :type bytes: str
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    flag = 1 if is_color else 0
-    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
-    img = cv2.imdecode(file_bytes, flag)
-    return img
-
-
-def load_image(file, is_color=True):
-    """
-    Load an color or gray image from the file path.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-
-    :param file: the input image path.
-    :type file: string
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    # cv2.IMAGE_COLOR for OpenCV3
-    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
-    # cv2.IMAGE_GRAYSCALE for OpenCV3
-    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
-    # Here, use constant 1 and 0
-    # 1: COLOR, 0: GRAYSCALE
-    flag = 1 if is_color else 0
-    im = cv2.imread(file, flag)
-    return im
-
-
-def resize_short(im, size):
-    """ 
-    Resize an image so that the length of shorter edge is size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the shorter edge size of image after resizing.
-    :type size: int
-    """
-    h, w = im.shape[:2]
-    h_new, w_new = size, size
-    if h > w:
-        h_new = size * h / w
-    else:
-        w_new = size * w / h
-    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
-    return im
-
-
-def to_chw(im, order=(2, 0, 1)):
-    """
-    Transpose the input image order. The image layout is HWC format
-    opened by cv2 or PIL. Transpose the input image to CHW layout
-    according the order (2,0,1).
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-        im = to_chw(im)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param order: the transposed order.
-    :type order: tuple|list 
-    """
-    assert len(im.shape) == len(order)
-    im = im.transpose(order)
-    return im
-
-
-def center_crop(im, size, is_color=True):
-    """
-    Crop the center of image with size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = center_crop(im, 224)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = (h - size) / 2
-    w_start = (w - size) / 2
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def random_crop(im, size, is_color=True):
-    """
-    Randomly crop input image with size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = random_crop(im, 224)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = np.random.randint(0, h - size + 1)
-    w_start = np.random.randint(0, w - size + 1)
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def left_right_flip(im, is_color=True):
-    """
-    Flip an image along the horizontal direction.
-    Return the flipped image.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = left_right_flip(im)
-    
-    :param im: input image with HWC layout or HW layout for gray image
-    :type im: ndarray
-    :param is_color: whether input image is color or not
-    :type is_color: bool
-    """
-    if len(im.shape) == 3 and is_color:
-        return im[:, ::-1, :]
-    else:
-        return im[:, ::-1]
-
-
-def simple_transform(im,
-                     resize_size,
-                     crop_size,
-                     is_train,
-                     is_color=True,
-                     mean=None):
-    """
-    Simply data argumentation for training. These operations include
-    resizing, croping and flipping.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = simple_transform(im, 256, 224, True)
-
-    :param im: The input image with HWC layout.
-    :type im: ndarray
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = resize_short(im, resize_size)
-    if is_train:
-        im = random_crop(im, crop_size, is_color=is_color)
-        if np.random.randint(2) == 0:
-            im = left_right_flip(im, is_color)
-    else:
-        im = center_crop(im, crop_size, is_color=is_color)
-    if len(im.shape) == 3:
-        im = to_chw(im)
-
-    im = im.astype('float32')
-    if mean is not None:
-        mean = np.array(mean, dtype=np.float32)
-        # mean value, may be one value per channel 
-        if mean.ndim == 1 and is_color:
-            mean = mean[:, np.newaxis, np.newaxis]
-        elif mean.ndim == 1:
-            mean = mean
-        else:
-            # elementwise mean
-            assert len(mean.shape) == len(im)
-        im -= mean
-
-    return im
-
-
-def load_and_transform(filename,
-                       resize_size,
-                       crop_size,
-                       is_train,
-                       is_color=True,
-                       mean=None):
-    """
-    Load image from the input file `filename` and transform image for
-    data argumentation. Please refer to the `simple_transform` interface
-    for the transform operations.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_and_transform('cat.jpg', 256, 224, True)
-
-    :param filename: The file name of input image.
-    :type filename: string
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = load_image(filename, is_color)
-    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
-    return im
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
deleted file mode 100644
index 28ee042282a..00000000000
--- a/python/paddle/v2/inference.py
+++ /dev/null
@@ -1,172 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy
-import collections
-import topology
-import paddle
-import cPickle
-
-__all__ = ['infer', 'Inference']
-
-
-class Inference(object):
-    """
-    Inference combines neural network output and parameters together
-    to do inference.
-
-    ..  code-block:: python
-
-        inferer = Inference(output_layer=prediction, parameters=parameters)
-        for data_batch in batches:
-            print inferer.infer(data_batch)
-
-
-    :param output_layer: The neural network that should be inferenced.
-    :type output_layer: paddle.v2.config_base.Layer or the sequence
-                        of paddle.v2.config_base.Layer
-    :param parameters: The parameters dictionary.
-    :type parameters: paddle.v2.parameters.Parameters
-    """
-
-    def __init__(self, parameters, output_layer=None, fileobj=None):
-        import py_paddle.swig_paddle as api
-
-        if output_layer is not None:
-            topo = topology.Topology(output_layer)
-            gm = api.GradientMachine.createFromConfigProto(
-                topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
-            self.__data_types__ = topo.data_type()
-        elif fileobj is not None:
-            tmp = cPickle.load(fileobj)
-            gm = api.GradientMachine.createByConfigProtoStr(
-                tmp['protobin'], api.CREATE_MODE_TESTING,
-                [api.PARAMETER_VALUE])
-            self.__data_types__ = tmp['data_type']
-        else:
-            raise ValueError("Either output_layer or fileobj must be set")
-
-        for param in gm.getParameters():
-            val = param.getBuf(api.PARAMETER_VALUE)
-            name = param.getName()
-            assert isinstance(val, api.Vector)
-            val.copyFromNumpyArray(parameters.get(name).flatten())
-            # the setValueUpdated function is called in randomize, zeroMem,
-            # load function in paddle/legacy/parameter/Parameter.cpp. But in the
-            # inference mode, the setValueUpdated is never called, it will
-            # cause the parameter will not be dispatched
-            # in MultiGradientMachine for multi-GPU. So setValueUpdated is
-            # called here, but it's better to call this function in one place.
-            param.setValueUpdated()
-        self.__gradient_machine__ = gm
-
-    def iter_infer(self, input, feeding=None):
-        from data_feeder import DataFeeder
-        feeder = DataFeeder(self.__data_types__, feeding)
-        batch_size = len(input)
-
-        def __reader_impl__():
-            for each_sample in input:
-                yield each_sample
-
-        reader = paddle.batch(__reader_impl__, batch_size=batch_size)
-
-        self.__gradient_machine__.start()
-        for data_batch in reader():
-            yield self.__gradient_machine__.forwardTest(feeder(data_batch))
-        self.__gradient_machine__.finish()
-
-    def iter_infer_field(self, field, **kwargs):
-        if not isinstance(field, list) and not isinstance(field, tuple):
-            field = [field]
-
-        for result in self.iter_infer(**kwargs):
-            for each_result in result:
-                item = [each_result[each_field] for each_field in field]
-                yield item
-
-    def infer(self, input, field='value', flatten_result=True, **kwargs):
-        """
-        Infer a data by model.
-        :param input: input data batch. Should be python iterable object.
-        :param field: output field.
-        """
-        retv = None
-        kwargs['input'] = input
-        for result in self.iter_infer_field(field=field, **kwargs):
-            if retv is None:
-                retv = [[] for i in xrange(len(result))]
-            for i, item in enumerate(result):
-                retv[i].append(item)
-
-        if retv == None:
-            return []
-
-        if flatten_result:
-            retv = [numpy.concatenate(out) for out in retv]
-
-        if len(retv) == 1:
-            return retv[0]
-        else:
-            return retv
-
-
-def infer(output_layer, parameters, input, feeding=None, field='value'):
-    """
-    Infer a neural network by given neural network output and parameters.  The
-    user should pass either a batch of input data or reader method.
-
-    Example usage for sinlge output_layer:
-
-    ..  code-block:: python
-
-        result = paddle.infer(output_layer=prediction,
-                              parameters=parameters,
-                              input=SomeData)
-        print result
-
-    Example usage for multiple outout_layers and fields:
-
-    ..  code-block:: python
-
-        result = paddle.infer(output_layer=[prediction1, prediction2],
-                              parameters=parameters,
-                              input=SomeData,
-                              field=[id, value]])
-        print result
-
-    :param output_layer: output of the neural network that would be inferred
-    :type output_layer: paddle.v2.config_base.Layer or a list of
-                        paddle.v2.config_base.Layer
-    :param parameters: parameters of the neural network.
-    :type parameters: paddle.v2.parameters.Parameters
-    :param input: input data batch. Should be a python iterable object, and each
-                  element is the data batch.
-    :type input: collections.Iterable
-    :param feeding: Reader dictionary. Default could generate from input
-                        value.
-    :param field: The prediction field. It should in [`value`, `id`, `prob`].
-                  `value` and `prob` mean return the prediction probabilities,
-                  `id` means return the prediction labels. Default is `value`.
-                  Note that `prob` only used when output_layer is beam_search
-                  or max_id.
-    :type field: str
-    :return: The prediction result. If there are multiple outout_layers and fields,
-             the return order is outout_layer1.field1, outout_layer2.field1, ...,
-             outout_layer1.field2, outout_layer2.field2 ...
-    :rtype: numpy.ndarray
-    """
-
-    inferer = Inference(output_layer=output_layer, parameters=parameters)
-    return inferer.infer(field=field, input=input, feeding=feeding)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
deleted file mode 100644
index a188a03eb36..00000000000
--- a/python/paddle/v2/layer.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-`paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
-we want to make Paddle a plain Python package. The model config package defines
-the way how to configure a neural network topology in Paddle Python code.
-
-The primary usage shows below.
-
-..  code-block:: python
-
-    import paddle
-
-    img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
-    hidden = paddle.layer.fc(input=img, size=200)
-    prediction = paddle.layer.fc(input=hidden, size=10,
-                                 act=paddle.activation.Softmax())
-
-    # use prediction instance where needed.
-    parameters = paddle.parameters.create(cost)
-"""
-import collections
-import copy
-import re
-import paddle.trainer_config_helpers.layers as v1_layers
-import paddle.trainer.config_parser as cp
-from paddle.proto.ModelConfig_pb2 import ModelConfig, SubModelConfig
-from config_base import __convert_to_v2__
-import config_base
-
-__all__ = ['data', 'parse_network']
-
-
-def __need_to_keep__(name):
-    return name in [
-        'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
-        'layer_support', 'BaseGeneratedInput'
-    ]
-
-
-def __need_to_wrap__(name):
-    return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
-
-
-def __convert_name__(inname):
-    if __need_to_keep__(inname):
-        return inname
-    if inname == 'maxid_layer':
-        return 'max_id'
-    elif inname.endswith('memory') or inname.endswith(
-            '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
-        return inname
-    elif inname in [
-            'cross_entropy', 'multi_binary_label_cross_entropy',
-            'cross_entropy_with_selfnorm'
-    ]:
-        return inname + "_cost"
-    elif inname.endswith('_cost'):
-        return inname
-    elif inname.endswith("_layer"):
-        return inname[:-len("_layer")]
-    else:
-        return inname
-
-
-for name in v1_layers.__all__:
-    obj = getattr(v1_layers, name)
-    new_name = __convert_name__(name)
-    if callable(obj) and __need_to_wrap__(name):
-        globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
-    else:
-        globals()[new_name] = obj
-    __all__.append(new_name)
-
-
-def __data_layer__(name, type, **kwargs):
-    l = v1_layers.data_layer(name, type.dim, **kwargs)
-    l.data_type = type
-    return l
-
-
-def __map_data_docstr__(doc):
-    doc = re.sub(r'(data = [^\)]+)\).*',
-                 "data = paddle.layer.data(name=\"input\", "
-                 "type=paddle.data_type.dense_vector(1000))", doc)
-
-    doc = re.sub(r':param size:.*', ':param type: Data type of this data layer',
-                 doc)
-    doc = re.sub(r':type size:.*', ":type size: paddle.v2.data_type.InputType",
-                 doc)
-    return doc
-
-
-__data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
-
-data = __convert_to_v2__(__data_layer__, 'name', __name__)
-
-
-def __get_used_layers__(output_layers):
-    layer_names = set()
-    parents = {}
-
-    def add_parent(child, parent):
-        if child in parents:
-            parents[child].append(parent)
-        else:
-            parents[child] = [parent]
-
-    def add_additional_parents():
-        for sub_model in cp.g_config.model_config.sub_models:
-            if sub_model.name == 'root':
-                continue
-            for link in sub_model.in_links:
-                add_parent(link.link_name, link.layer_name)
-                add_parent(sub_model.name, link.layer_name)
-            for link in sub_model.out_links:
-                add_parent(link.link_name, link.layer_name)
-                add_parent(link.link_name, sub_model.name)
-            for mem in sub_model.memories:
-                if mem.boot_layer_name:
-                    add_parent(mem.layer_name, mem.boot_layer_name)
-                add_parent(mem.link_name, mem.layer_name)
-
-            if sub_model.HasField('generator'):
-                # according to the implementation of text generation
-                # in recurrent layer group, the generated word must be
-                # the first out link
-                add_parent(sub_model.out_links[0].layer_name,
-                           sub_model.generator.eos_layer_name)
-
-    def dfs_travel(layer_name):
-        if layer_name in layer_names:
-            return
-        layer_names.add(layer_name)
-        layer = cp.g_layer_map[layer_name]
-
-        for inp in layer.inputs:
-            dfs_travel(inp.input_layer_name)
-        if layer.name in parents:
-            for p in parents[layer.name]:
-                dfs_travel(p)
-
-    add_additional_parents()
-
-    for layer in output_layers:
-        dfs_travel(layer.full_name)
-
-    # print layer needs to be specially handled because no other
-    # layer depends on it. It is used to print the result of some
-    # layers when running the model for debug purpose. So we explicitly
-    # add a print layer to the topolty if its input is in the toplogy.
-    for layer in cp.g_config.model_config.layers:
-        if layer.type == 'print':
-            used = True
-            for inp in layer.inputs:
-                if inp.input_layer_name not in layer_names:
-                    used = False
-                    break
-            if used:
-                layer_names.add(layer.name)
-
-    return layer_names
-
-
-def __get_used_parameters__(layer_names, sub_models):
-    parameter_names = set()
-    for name in layer_names:
-        l = cp.g_layer_map[name]
-        for inp in l.inputs:
-            if inp.input_parameter_name:
-                parameter_names.add(inp.input_parameter_name)
-        if l.bias_parameter_name:
-            parameter_names.add(l.bias_parameter_name)
-
-    for sub_model in sub_models:
-        for mem in sub_model.memories:
-            if mem.HasField("boot_bias_parameter_name"):
-                parameter_names.add(mem.boot_bias_parameter_name)
-
-    return parameter_names
-
-
-def __get_used_submodels__(layer_names):
-    submodel_names = set()
-    for submodel in cp.g_config.model_config.sub_models:
-        if submodel.name in layer_names:
-            submodel_names.add(submodel.name)
-    return submodel_names
-
-
-def __get_submodel_data_out_links__():
-    data_links = set()
-    for submodel in cp.g_config.model_config.sub_models:
-        for link in submodel.out_links:
-            if cp.g_layer_map[link.link_name].type == 'data':
-                data_links.add(link.link_name)
-    return data_links
-
-
-def __get_used_evaluators__(layer_names):
-    evaluator_names = set()
-    for e in cp.g_config.model_config.evaluators:
-        used = True
-        for name in e.input_layers:
-            if name not in layer_names:
-                used = False
-                break
-        if used:
-            evaluator_names.add(e.name)
-    return evaluator_names
-
-
-def __trim_submodel__(old_submodel, layer_names, input_layer_names,
-                      output_layer_names, evaluator_names):
-
-    submodel = SubModelConfig()
-    submodel.name = old_submodel.name
-    submodel.layer_names.extend(
-        filter(lambda x: x in layer_names, old_submodel.layer_names))
-    submodel.input_layer_names.extend(
-        filter(lambda x: x in input_layer_names, submodel.layer_names))
-    submodel.output_layer_names.extend(
-        filter(lambda x: x in output_layer_names, submodel.layer_names))
-    submodel.evaluator_names.extend(
-        filter(lambda x: x in evaluator_names, old_submodel.evaluator_names))
-
-    submodel.is_recurrent_layer_group = old_submodel.is_recurrent_layer_group
-    submodel.reversed = old_submodel.reversed
-
-    submodel.memories.extend(
-        filter(lambda x: x.link_name in layer_names, old_submodel.memories))
-    target_inlinkid = (old_submodel.target_inlinkid
-                       if old_submodel.HasField('target_inlinkid') else -1)
-    in_links = []
-    for i, link in enumerate(old_submodel.in_links):
-        if link.link_name in layer_names or i == target_inlinkid:
-            in_links.append(link)
-            if i == target_inlinkid:
-                target_inlinkid = len(in_links) - 1
-    submodel.in_links.extend(in_links)
-
-    submodel.out_links.extend(
-        filter(lambda x: x.link_name in layer_names, old_submodel.out_links))
-    if old_submodel.HasField('generator'):
-        submodel.generator.CopyFrom(old_submodel.generator)
-
-    if old_submodel.HasField('target_inlinkid'):
-        submodel.target_inlinkid = target_inlinkid
-    return submodel
-
-
-def parse_network(output_layers, extra_layers=None):
-    if not isinstance(output_layers, collections.Sequence):
-        output_layers = [output_layers]
-    if extra_layers is not None:
-        if not isinstance(extra_layers, collections.Sequence):
-            extra_layers = [extra_layers]
-    else:
-        extra_layers = []
-
-    layer_names = __get_used_layers__(list(output_layers) + list(extra_layers))
-    submodel_names = __get_used_submodels__(layer_names)
-    submodel_names.add('root')
-    evaluator_names = __get_used_evaluators__(layer_names)
-    data_out_links = __get_submodel_data_out_links__()
-    input_layer_names = set()
-    output_layer_names = set()
-
-    model_config = ModelConfig()
-    model_config.type = cp.g_config.model_config.type
-
-    for layer in output_layers:
-        model_config.output_layer_names.append(layer.full_name)
-        output_layer_names.add(layer.full_name)
-
-    for l in cp.g_config.model_config.layers:
-        if l.name not in layer_names:
-            continue
-        model_config.layers.extend([l])
-        if l.type == 'data':
-            if l.name in data_out_links:
-                """
-                In text generation, the outlink to save the generated word
-                indices is a data_layer defined in recurrent_group. This
-                data_layer is sure to be the output of the network in text
-                generation task, so this statement excludes such a special
-                data_layer from being inputs of the network, otherwise an error
-                will occur during data feeding.
-                """
-                continue
-            model_config.input_layer_names.append(l.name)
-            input_layer_names.add(l.name)
-
-    for e in cp.g_config.model_config.evaluators:
-        if e.name in evaluator_names:
-            model_config.evaluators.extend([e])
-
-    for s in cp.g_config.model_config.sub_models:
-        if s.name in submodel_names:
-            s = __trim_submodel__(s, layer_names, input_layer_names,
-                                  output_layer_names, evaluator_names)
-            model_config.sub_models.extend([s])
-
-    parameter_names = __get_used_parameters__(layer_names,
-                                              model_config.sub_models)
-
-    for p in cp.g_config.model_config.parameters:
-        if p.name in parameter_names:
-            model_config.parameters.extend([p])
-
-    return model_config
-
-
-def get_layer(name):
-    return config_base.__layer_map__.get(name)
diff --git a/python/paddle/v2/master/.gitignore b/python/paddle/v2/master/.gitignore
deleted file mode 100644
index a3ac6e1a33e..00000000000
--- a/python/paddle/v2/master/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.whl
-*.so
-*.pyc
diff --git a/python/paddle/v2/master/__init__.py b/python/paddle/v2/master/__init__.py
deleted file mode 100644
index efaeeabfa26..00000000000
--- a/python/paddle/v2/master/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from client import *
-
-__all__ = ['client']
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
deleted file mode 100644
index d62e7cc28ef..00000000000
--- a/python/paddle/v2/master/client.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-import os
-
-__lib__ = None
-
-
-def get_c_lib():
-    global __lib__
-    if __lib__ is None:
-        path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
-        __lib__ = ctypes.cdll.LoadLibrary(path)
-    return __lib__
-
-
-class client(object):
-    """
-    client is a client to the master server.
-    """
-
-    def __init__(self, etcd_endpoints, timeout_sec, buf_size=0):
-        self.c = get_c_lib().paddle_new_etcd_master_client(
-            etcd_endpoints, timeout_sec, buf_size)
-
-    def request_save_model(self, trainer_id, block_ms):
-        """request to save model
-
-        Conventionally the 0-th trainer will save model. But in
-        distributed training, any trainer could be killed. This
-        function asks the master server if the trainer should proceed
-        with saving model.
-
-        :param trainer_id: trainer id.
-        :param block_ms: number of millisecond that other save model
-        will be blocked if this save model request succeeded.
-
-        Returns:
-            int: 1 if the save the model request is approved, 0 if
-            does the request is rejected because other trainer is
-            saving the model, -1 if error happened.
-
-        """
-        return get_c_lib().paddle_request_save_model(self.c, trainer_id,
-                                                     block_ms)
-
-    def release(self):
-        get_c_lib().paddle_release_master_client(self.c)
-        self.c = None
-
-    def set_dataset(self, paths):
-        holder_type = ctypes.c_char_p * len(paths)
-        holder = holder_type()
-        for idx, path in enumerate(paths):
-            c_ptr = ctypes.c_char_p(path)
-            holder[idx] = c_ptr
-        get_c_lib().paddle_set_dataset(self.c, holder, len(paths))
-
-    def next_record(self):
-        """gets next record for training
-
-        Returns:
-            string: the record.
-            int: error code, 0 if successful, < 0 otherwise.
-        """
-        p = ctypes.c_char_p()
-        ret = ctypes.pointer(p)
-        size = get_c_lib().paddle_next_record(self.c, ret)
-        if size < 0:
-            # Error
-            return None, size
-
-        if size == 0:
-            # Empty record
-            return "", 0
-
-        record = ret.contents.value[:size]
-        # Memory created from C should be freed.
-        get_c_lib().mem_free(ret.contents)
-        return record, 0
-
-    def paddle_start_get_records(self, pass_id):
-        get_c_lib().paddle_start_get_records(self.c, pass_id)
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
deleted file mode 100644
index 3c6a53db3c2..00000000000
--- a/python/paddle/v2/minibatch.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['batch']
-
-
-def batch(reader, batch_size, drop_last=True):
-    """
-    Create a batched reader.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param batch_size: size of each mini-batch
-    :type batch_size: int
-    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
-    :type drop_last: bool
-    :return: the batched reader.
-    :rtype: callable
-    """
-
-    def batch_reader():
-        r = reader()
-        b = []
-        for instance in r:
-            b.append(instance)
-            if len(b) == batch_size:
-                yield b
-                b = []
-        if drop_last == False and len(b) != 0:
-            yield b
-
-    return batch_reader
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
deleted file mode 100644
index 8ae9f3b202d..00000000000
--- a/python/paddle/v2/networks.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.networks as conf_nw
-import inspect
-from config_base import __convert_to_v2__
-
-__all__ = []
-
-
-def __initialize__():
-    for each_subnetwork in conf_nw.__all__:
-        if each_subnetwork in ['inputs', 'outputs']:
-            continue
-        func = getattr(conf_nw, each_subnetwork)
-        globals()[each_subnetwork] = func
-        globals()[each_subnetwork].__name__ = each_subnetwork
-        global __all__
-        __all__.append(each_subnetwork)
-
-
-__initialize__()
diff --git a/python/paddle/v2/op.py b/python/paddle/v2/op.py
deleted file mode 100644
index 03f3b9b9ef2..00000000000
--- a/python/paddle/v2/op.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import layer
-import activation as act
-from config_base import Layer
-from paddle.trainer_config_helpers.attrs import is_compatible_with
-from paddle.trainer_config_helpers.default_decorators import wrap_name_default
-
-__all__ = []
-
-
-def __register_unary_math_op__(op_name, act):
-    def op(input, name=None):
-        return layer.mixed(
-            input=[layer.identity_projection(input=input)], name=name, act=act)
-
-    op = wrap_name_default(op_name)(op)
-    op.__doc__ = type(act).__doc__
-    globals()[op_name] = op
-    __all__.append(op_name)
-
-
-__register_unary_math_op__('exp', act.Exp())
-__register_unary_math_op__('log', act.Log())
-__register_unary_math_op__('abs', act.Abs())
-__register_unary_math_op__('sigmoid', act.Sigmoid())
-__register_unary_math_op__('tanh', act.Tanh())
-__register_unary_math_op__('square', act.Square())
-__register_unary_math_op__('relu', act.Relu())
-__register_unary_math_op__('sqrt', act.Sqrt())
-__register_unary_math_op__('reciprocal', act.Reciprocal())
-__register_unary_math_op__('softmax', act.Softmax())
-
-
-def __add__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, intercept=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be added with"
-                        " another Layer or a number")
-    if layeroutput.size == other.size:
-        return layer.mixed(input=[
-            layer.identity_projection(input=layeroutput),
-            layer.identity_projection(input=other)
-        ])
-    if other.size != 1 and layeroutput.size != 1:
-        raise TypeError("Two Layer can be added only if they have equal size"
-                        " or one of their sizes is 1. sizes are %s and %s" %
-                        (layeroutput.size, other.size))
-    elif layeroutput.size == 1:
-        tmp = layeroutput
-        layeroutput = other
-        other = tmp
-    other = layer.repeat(other, layeroutput.size)
-    return layer.mixed(input=[
-        layer.identity_projection(input=layeroutput),
-        layer.identity_projection(input=other)
-    ])
-
-
-Layer.__radd__ = __add__
-Layer.__add__ = __add__
-
-
-def __neg__(layeroutput):
-    return layer.slope_intercept(input=layeroutput, slope=-1.0)
-
-
-Layer.__neg__ = __neg__
-
-
-def __sub__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, intercept=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be subtracted with"
-                        " another Layeroutput or a number")
-    return __add__(layeroutput, -other)
-
-
-Layer.__sub__ = __sub__
-
-
-def __rsub__(layeroutput, other):
-    neg = layer.slope_intercept(input=layeroutput, slope=-1.0)
-    return __add__(neg, other)
-
-
-Layer.__rsub__ = __rsub__
-
-
-def __mul__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, slope=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be multiplied with"
-                        " another Layer or a number")
-    elif layeroutput.size == 1:
-        return layer.scaling(input=other, weight=layeroutput)
-    elif other.size == 1:
-        return layer.scaling(input=layeroutput, weight=other)
-    else:
-        raise TypeError("At least one of the operand of '*' must be a number"
-                        " or a Layer with size=1")
-
-
-Layer.__mul__ = __mul__
-Layer.__rmul__ = __mul__
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
deleted file mode 100644
index caef5f484e2..00000000000
--- a/python/paddle/v2/optimizer.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
-import paddle.trainer_config_helpers.optimizers as v1_optimizers
-from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig
-
-__all__ = [
-    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
-    'RMSProp', 'ModelAverage', 'L2Regularization'
-]
-
-
-class Optimizer(object):
-    def __init__(self, **kwargs):
-        import py_paddle.swig_paddle as swig_api
-        if 'batch_size' in kwargs:
-            del kwargs['batch_size']  # not important for python library.
-
-        def __impl__():
-            v1_optimizers.settings(batch_size=1, **kwargs)
-
-        self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
-            __impl__)
-        self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
-            self.__opt_conf_proto__)
-
-    def enable_types(self):
-        """
-        get enable_types for each optimizer.
-        enable_types = [value, gradient, momentum, etc]
-        For each optimizer(SGD, Adam), GradientMachine should enable different
-        buffers.
-        """
-        import py_paddle.swig_paddle as swig_api
-        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
-        assert isinstance(tmp, swig_api.ParameterOptimizer)
-        return tmp.getParameterTypes()
-
-    def __create_local_updater__(self):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
-
-    def __create_remote_updater__(self, pass_num, use_sparse_updater):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createRemoteUpdater(
-            self.__opt_conf__, pass_num, use_sparse_updater)
-
-    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createNewRemoteUpdater(
-            self.__opt_conf__, pserver_spec, use_etcd)
-
-    def create_updater(self, is_local, num_passes, use_sparse_updater,
-                       pserver_spec, use_etcd):
-        """
-        create proper parameter_updater by configuration.
-        :param is_local: create local or remote parameter updater
-        :param num_passes: remote parameter updater will use this to config
-        parameter server.
-        :param use_sparse_updater: when use remote updater, if some parameter is
-        sparse, updater should do some extra thing:
-
-        ..  code-block:: python
-
-            if use_sparse_remote_updater:
-                        gradient_machine.prefetch(in_args)
-                        parameter_updater.getParametersRemote()
-
-        :param pserver_spec: pserver location, eg: localhost:3000, if use etcd,
-        pserver_spec should be the etcd endpoints, eg: http://localhost:2379
-        :return: parameter_updater
-        """
-        if is_local:
-            parameter_updater = self.__create_local_updater__()
-        else:
-            if pserver_spec is None:
-                parameter_updater = self.__create_remote_updater__(
-                    num_passes, use_sparse_updater)
-            else:
-                parameter_updater = self.__create_new_remote_updater__(
-                    pserver_spec, use_etcd)
-        return parameter_updater
-
-
-class Momentum(Optimizer):
-    """
-    Momentum Optimizer.
-
-    When sparse=False, the momentum update formula is as follows:
-
-    ..  math::
-
-        v_{t} &= k * v_{t-1} - \\gamma_t (g_{t} + \\lambda w_{t-1}) \\\\
-        w_{t} &= w_{t-1} + v_{t} \\\\
-
-    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
-    :math:`\\gamma_t` is learning rate at the t'th iteration.
-    :math:`w_{t}` is the weight as the t'th iteration.
-    And the :math:`v_{t}` is the history momentum variable.
-
-    When sparse=True, the update scheme:
-
-    ..  math::
-
-        \\alpha_t &= \\alpha_{t-1} / k \\\\
-        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
-        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
-        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
-        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
-    
-    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
-    :math:`\\gamma_t` is learning rate at the t'th iteration.
-
-    :param momentum: the momentum factor.
-    :type momentum: float
-    :param sparse: with sparse support or not, False by default.
-    :type sparse: bool
-    """
-
-    def __init__(self, momentum=None, sparse=False, **kwargs):
-        learning_method = v1_optimizers.MomentumOptimizer(
-            momentum=momentum, sparse=sparse)
-        super(Momentum, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class Adam(Optimizer):
-    """
-    Adam optimizer.
-    The details of please refer `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
-
-    :param beta1: the :math:`\\beta_1` in equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in equation.
-    :type beta2: float
-    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
-                        divided by zero.
-    :type epsilon: float
-    """
-
-    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
-        learning_method = v1_optimizers.AdamOptimizer(
-            beta1=beta1, beta2=beta2, epsilon=epsilon)
-        super(Adam, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class Adamax(Optimizer):
-    """
-    Adamax optimizer.
-
-    The details of please refer this `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
-
-    :param beta1: the :math:`\\beta_1` in the equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in the equation.
-    :type beta2: float
-    """
-
-    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
-        learning_method = v1_optimizers.AdamaxOptimizer(
-            beta1=beta1, beta2=beta2)
-        super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class AdaGrad(Optimizer):
-    """
-    Adagrad(for ADAptive GRAdient algorithm) optimizer.
-
-    For details please refer this `Adaptive Subgradient Methods for
-    Online Learning and Stochastic Optimization
-    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
-
-    ..  math::
-
-        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
-    """
-
-    def __init__(self, **kwargs):
-        learning_method = v1_optimizers.AdaGradOptimizer()
-        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class DecayedAdaGrad(Optimizer):
-    """
-    AdaGrad method with decayed sum gradients. The equations of this method
-    show as follow.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )
-
-    :param rho: The :math:`\\rho` parameter in that equation
-    :type rho: float
-    :param epsilon: The :math:`\\epsilon` parameter in that equation.
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
-        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(DecayedAdaGrad, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class AdaDelta(Optimizer):
-    """
-    AdaDelta method. The details of adadelta please refer to this
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
-                          E(g_t^2) + \\epsilon ) ) \\\\
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
-
-    :param rho: :math:`\\rho` in equation
-    :type rho: float
-    :param epsilon: :math:`\\rho` in equation
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
-        learning_method = v1_optimizers.AdaDeltaOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(AdaDelta, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class RMSProp(Optimizer):
-    """
-    RMSProp(for Root Mean Square Propagation) optimizer. For details please
-    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
-    lecture_slides_lec6.pdf>`_.
-
-    The equations of this method as follows:
-
-    ..  math::
-
-        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
-    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
-    :type rho: float
-    :param epsilon: the :math:`\\epsilon` in the equation.
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
-        learning_method = v1_optimizers.RMSPropOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
-
-
-ModelAverage = v1_optimizers.ModelAverage
-L2Regularization = v1_optimizers.L2Regularization
-
-if __name__ == '__main__':
-    import py_paddle.swig_paddle as swig_api
-    swig_api.initPaddle('--use_gpu=false')
-    for opt in [
-            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
-            AdaDelta(), RMSProp(), Adam(
-                model_average=ModelAverage(average_window=0.5),
-                regularization=L2Regularization(rate=0.5),
-                gradient_clipping_threshold=25)
-    ]:
-        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
deleted file mode 100644
index 7b7d1a1d167..00000000000
--- a/python/paddle/v2/parameters.py
+++ /dev/null
@@ -1,441 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-from collections import OrderedDict
-import paddle.trainer.config_parser as cp
-import struct
-import tarfile
-import cStringIO
-from topology import Topology
-
-__all__ = ['Parameters', 'create']
-
-
-def create(layers):
-    """
-    Create parameter pool by topology.
-
-    :param layers:
-    :return:
-    """
-    topology = Topology(layers)
-    pool = Parameters()
-    initializers = cp.g_parameter_initializer_map
-    for param in topology.proto().parameters:
-        pool.__append_config__(param)
-        if param.name in initializers:
-            pool[param.name] = initializers[param.name](param.name)
-    return pool
-
-
-class Parameters(object):
-    """
-    `Parameters` manages all the learnable parameters in a neural network.
-    It stores parameters' information in an OrderedDict. The key is
-    the name of a parameter, and value is a parameter's configuration(in
-    protobuf format), such as initialization mean and std, its size, whether it
-    is a static parameter, and so on.
-
-    :param __param_conf__: store the configurations of learnable parameters in
-        the network in an OrderedDict. Parameter is added one by one into the
-        dict by following their created order in the network: parameters of
-        the previous layers in a network are careted first. You can visit the
-        parameters from bottom to top by iterating over this dict.
-    :type __param_conf__: OrderedDict
-    :param __gradient_machines__: all of the parameters in a neural network are
-        appended to a PaddlePaddle gradient machine, which is used internally to
-        copy parameter values between C++ and Python end.
-    :type __gradient_machines__: list
-    :param __tmp_params__: a dict to store dummy parameters if no
-        __gradient_machines__ is appended to `Parameters`.
-    :type __tmp_params__: dict
-
-    Basically usage is
-
-    ..  code-block:: python
-
-        data = paddle.layers.data(...)
-        ...
-        out = paddle.layers.fc(...)
-
-        parameters = paddle.parameters.create(out)
-
-        parameter_names = parameters.names()
-        fc_mat = parameters.get('fc')
-        print fc_mat
-    """
-
-    def __init__(self):
-        self.__param_conf__ = OrderedDict()
-        self.__gradient_machines__ = []
-        self.__tmp_params__ = dict()
-
-    def __append_config__(self, param_conf):
-        """
-        Append a parameter configuration. It used to initialize Parameters and
-        should be invoked only in paddle.parameters.create
-
-        :param param_conf: The parameter configuration in protobuf
-        :type param_conf: ParameterConfig
-        :return: Nothing
-        """
-
-        if not isinstance(param_conf, ParameterConfig):
-            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
-
-        if param_conf.name in self.__param_conf__:
-            raise ValueError("duplicated parameter %s" % param_conf.name)
-
-        self.__param_conf__[param_conf.name] = param_conf
-
-    def update_param_conf(self, model_config):
-        for p in model_config.parameters:
-            self.__param_conf__[p.name] = p
-
-    def keys(self):
-        """
-        keys are the names of each parameter.
-
-        :return: list of parameter name
-        :rtype: list
-        """
-        return self.__param_conf__.keys()
-
-    def names(self):
-        """
-        names of each parameter.
-
-        :return: list of parameter name
-        :rtype: list
-        """
-        return self.keys()
-
-    def has_key(self, key):
-        """
-        has_key return true if there are such parameter name == key
-
-        :param key: Parameter name
-        :type key: basestring
-        :return: True if contains such key
-        """
-        return key in self.__param_conf__.keys()
-
-    def __iter__(self):
-        """
-        Return an iterator of parameter name. It is used by `for loop`
-        or `in` operator.
-
-        ..  code-block:: python
-
-            parameters = paddle.parameters.create(...)
-            if "fc_param" in parameters:
-                print 'OK'
-        :return: an iterator of parameter name
-        :rtype: iterator
-        """
-        return iter(self.__param_conf__)
-
-    def __getter_inner(self, key, param_type):
-        import py_paddle.swig_paddle as api
-        shape = self.get_shape(key)
-
-        if len(self.__gradient_machines__) == 0:
-            # create new parameter in python numpy.
-            if key in self.__tmp_params__:
-                return self.__tmp_params__[key]
-            else:
-                return np.ndarray(shape=shape, dtype=np.float32)
-        else:
-            for each_gradient_machine in self.__gradient_machines__:
-                param = __get_parameter_in_gradient_machine__(
-                    each_gradient_machine, key)
-                # for simplify implementation now, we always copy from C++
-                assert isinstance(param, api.Parameter)
-                val = param.getBuf(param_type)
-                assert isinstance(val, api.Vector)
-                val = val.copyToNumpyArray()
-                return val
-                # else continue
-
-            raise RuntimeError("Unexpected branch")
-
-    def __getitem__(self, key):
-        """
-        Get parameter by parameter name. It uses Python dict syntax.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :return: parameter value
-        :rtype: np.ndarray
-        """
-        import py_paddle.swig_paddle as api
-        return self.__getter_inner(key, api.PARAMETER_VALUE)
-
-    def get_shape(self, key):
-        """
-        get shape of the parameter.
-
-        :param key: parameter name
-        :type key: basestring
-        :return: parameter's shape
-        :rtype: tuple
-        """
-        if not isinstance(key, basestring):
-            raise ValueError("parameter name should be string")
-        if not self.has_key(key):
-            raise ValueError("No such parameter %s" % key)
-        conf = self.__param_conf__[key]
-        dims = conf.dims if conf.dims else (1, conf.size)
-        return tuple(map(int, dims))
-
-    def __setitem__(self, key, value):
-        """
-        Set parameter by parameter name & value. It use Python dict syntax.
-
-        :note: It will always copy the parameter to C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :param value: Parameter matrix.
-        :type value: np.ndarray
-        :return: Nothing
-        """
-
-        if not isinstance(value, np.ndarray):
-            raise ValueError("Must return ndarray")
-        value = value.astype(dtype=np.float32)
-        shape = self.get_shape(key)
-        if value.shape != shape:
-            raise ValueError("Value shape mismatch, expect %s, should %s" %
-                             (shape, value.shape))
-
-        if len(self.__gradient_machines__) == 0:
-            self.__tmp_params__[key] = value
-        else:
-            for each_gradient_machine in self.__gradient_machines__:
-                __copy_parameter_to_gradient_machine__(each_gradient_machine,
-                                                       key, value)
-
-    def get(self, parameter_name):
-        """
-        Get parameter by parameter name.
-
-        :note: It will always copy the parameter from C++ side.
-        :param parameter_name: parameter name
-        :type parameter_name: basestring
-        :return: The parameter matrix.
-        :rtype: np.ndarray
-        """
-        return self.__getitem__(key=parameter_name)
-
-    def get_grad(self, key):
-        """
-        Get grandient by parameter name.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: parameter name
-        :type key: basestring
-        :return: The grandient matrix.
-        :rtype: np.ndarray
-        """
-        import py_paddle.swig_paddle as api
-        if self.__param_conf__[key].is_static:
-            return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
-
-        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
-
-    def set(self, parameter_name, value):
-        """
-        Set parameter by parameter name & matrix.
-
-        :param parameter_name: parameter name
-        :type parameter_name: basestring
-        :param value: parameter matrix
-        :type value: np.ndarray
-        :return: Nothing.
-        """
-        self.__setitem__(key=parameter_name, value=value)
-
-    def append_gradient_machine(self, gradient_machine):
-        """
-        append gradient machine to parameters. This method is used internally in
-        Trainer.train.
-
-        :param gradient_machine: PaddlePaddle C++ GradientMachine object.
-        :type gradient_machine: api.GradientMachine
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        if not isinstance(gradient_machine, api.GradientMachine):
-            raise ValueError("gradient_machine should be api.GradientMachine")
-
-        if len(self.__tmp_params__) != 0:
-            for name, val in self.__tmp_params__.iteritems():
-                try:
-                    __copy_parameter_to_gradient_machine__(gradient_machine,
-                                                           name, val)
-                except ValueError:
-                    # If no such parameter in gradient machine, then don't copy
-                    pass
-
-        self.__gradient_machines__.append(gradient_machine)
-
-    def serialize(self, name, f):
-        """
-
-        :param name:
-        :param f:
-        :type f: file
-        :return:
-        """
-        param = self.get(name)
-        size = reduce(lambda a, b: a * b, param.shape)
-        f.write(struct.pack("IIQ", 0, 4, size))
-        param = param.astype(np.float32)
-        s = param.tostring()
-        wrote_size = 0
-        buf = buffer(s, wrote_size, 65535)
-        while buf:  # f.write crashes with big data blog.
-            f.write(buf)
-            wrote_size += 65535
-            buf = buffer(s, wrote_size, 65535)
-
-    def deserialize(self, name, f):
-        """
-
-        :param name:
-        :param f:
-        :type f: file
-        :return:
-        """
-        f.read(16)  # header
-        arr = np.frombuffer(f.read(), dtype=np.float32)
-        self.set(name, arr.reshape(self.get_shape(name)))
-
-    def to_tar(self, f):
-        """
-        Save parameters to a tar file.
-
-        WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)`
-            to save parameters most of the time. Otherwise, some settings such
-            as model average will not take effect.
-
-        :param f:
-        :type f: file
-        :return:
-        """
-        tar = tarfile.TarFile(fileobj=f, mode='w')
-        for nm in self.names():
-            buf = cStringIO.StringIO()
-            self.serialize(nm, buf)
-            tarinfo = tarfile.TarInfo(name=nm)
-            buf.seek(0)
-            tarinfo.size = len(buf.getvalue())
-            tar.addfile(tarinfo, buf)
-
-            conf = self.__param_conf__[nm]
-            confStr = conf.SerializeToString()
-            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
-            tarinfo.size = len(confStr)
-            buf = cStringIO.StringIO(confStr)
-            buf.seek(0)
-            tar.addfile(tarinfo, fileobj=buf)
-
-    @staticmethod
-    def from_tar(f):
-        """
-        Create a `Parameters` object from the given file. And
-        the `Parameters` only contains the parameters in this
-        file. It is adapted the parameters are same in the
-        defined network and the given file. For example, it
-        can be used in the inference.
-
-        :param f: the initialized model file.
-        :type f: tar file
-        :return: A Parameters object.
-        :rtype: Parameters.
-        """
-        params = Parameters()
-        tar = tarfile.TarFile(fileobj=f, mode='r')
-        for finfo in tar:
-            assert isinstance(finfo, tarfile.TarInfo)
-            if finfo.name.endswith('.protobuf'):
-                f = tar.extractfile(finfo)
-                conf = ParameterConfig()
-                conf.ParseFromString(f.read())
-                params.__append_config__(conf)
-
-        for param_name in params.names():
-            f = tar.extractfile(param_name)
-            params.deserialize(param_name, f)
-        return params
-
-    def init_from_tar(self, f, exclude_params=[]):
-        """
-        Different from `from_tar`, this interface can be used to
-        init partial network parameters from another saved model.
-
-        :param f: the initialized model file.
-        :type f: tar file
-        :param exclude_params: the names of parameters that should  
-            not be initialized from the model file.
-        :type exclude_params: list of strings
-        :return: Nothing.
-        """
-
-        tar_param = Parameters.from_tar(f)
-        for pname in tar_param.names():
-            if pname in self.names() and pname not in exclude_params:
-                self.set(pname, tar_param.get(pname))
-
-
-def __get_parameter_in_gradient_machine__(gradient_machine, name):
-    """
-
-    :param gradient_machine:
-    :type gradient_machine: api.GradientMachine
-    :param name:
-    :return:
-    :rtype: api.Parameter
-    """
-    params = filter(lambda p: p.getName() == name,
-                    gradient_machine.getParameters())
-
-    if len(params) == 0:
-        raise ValueError("No such parameter")
-    elif len(params) > 1:
-        raise ValueError("Unexpected branch")
-    else:
-        return params[0]
-
-
-def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
-    """
-    Copy a python ndarray into the gradient machine.
-
-    :param gradient_machine:
-    :type gradient_machine: api.GradientMachine
-    :param name:
-    :param arr:
-    :type arr: np.ndarray
-    :return:
-    :rtype: api.Parameter
-    """
-    import py_paddle.swig_paddle as api
-    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
-    vec = param.getBuf(api.PARAMETER_VALUE)
-    assert isinstance(vec, api.Vector)
-    vec.copyFromNumpyArray(arr.flatten())
diff --git a/python/paddle/v2/plot/__init__.py b/python/paddle/v2/plot/__init__.py
deleted file mode 100644
index acd3013db4e..00000000000
--- a/python/paddle/v2/plot/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from plot import Ploter
-
-__all__ = ['Ploter']
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
deleted file mode 100644
index c18e63dd5f6..00000000000
--- a/python/paddle/v2/plot/plot.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-class PlotData(object):
-    def __init__(self):
-        self.step = []
-        self.value = []
-
-    def append(self, step, value):
-        self.step.append(step)
-        self.value.append(value)
-
-    def reset(self):
-        self.step = []
-        self.value = []
-
-
-class Ploter(object):
-    def __init__(self, *args):
-        self.__args__ = args
-        self.__plot_data__ = {}
-        for title in args:
-            self.__plot_data__[title] = PlotData()
-        # demo in notebooks will use Ploter to plot figure, but when we convert
-        # the ipydb to py file for testing, the import of matplotlib will make the
-        # script crash. So we can use `export DISABLE_PLOT=True` to disable import
-        # these libs
-        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
-        if not self.__plot_is_disabled__():
-            import matplotlib.pyplot as plt
-            from IPython import display
-            self.plt = plt
-            self.display = display
-
-    def __plot_is_disabled__(self):
-        return self.__disable_plot__ == "True"
-
-    def append(self, title, step, value):
-        assert isinstance(title, basestring)
-        assert self.__plot_data__.has_key(title)
-        data = self.__plot_data__[title]
-        assert isinstance(data, PlotData)
-        data.append(step, value)
-
-    def plot(self, path=None):
-        if self.__plot_is_disabled__():
-            return
-
-        titles = []
-        for title in self.__args__:
-            data = self.__plot_data__[title]
-            assert isinstance(data, PlotData)
-            if len(data.step) > 0:
-                titles.append(title)
-                self.plt.plot(data.step, data.value)
-        self.plt.legend(titles, loc='upper left')
-        if path is None:
-            self.display.clear_output(wait=True)
-            self.display.display(self.plt.gcf())
-        else:
-            self.plt.savefig(path)
-        self.plt.gcf().clear()
-
-    def reset(self):
-        for key in self.__plot_data__:
-            data = self.__plot_data__[key]
-            assert isinstance(data, PlotData)
-            data.reset()
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
deleted file mode 100644
index 4b6c1c80969..00000000000
--- a/python/paddle/v2/plot/tests/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-if (NOT APPLE)
-  # The Mac OS X backend will not be able to function correctly if Python is
-  # not installed as a framework.
-  py_test(test_ploter SRCS test_ploter.py)
-endif()
diff --git a/python/paddle/v2/plot/tests/__init__.py b/python/paddle/v2/plot/tests/__init__.py
deleted file mode 100644
index d1abfc08f19..00000000000
--- a/python/paddle/v2/plot/tests/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import test_ploter
-
-__all__ = ['test_ploter.py']
diff --git a/python/paddle/v2/plot/tests/test_ploter.py b/python/paddle/v2/plot/tests/test_ploter.py
deleted file mode 100644
index a75f853ed93..00000000000
--- a/python/paddle/v2/plot/tests/test_ploter.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from paddle.v2.plot import Ploter
-
-
-class TestCommon(unittest.TestCase):
-    def test_append(self):
-        title1 = "title1"
-        title2 = "title2"
-        plot_test = Ploter(title1, title2)
-        plot_test.append(title1, 1, 2)
-        plot_test.append(title1, 2, 5)
-        plot_test.append(title2, 3, 4)
-        self.assertEqual(plot_test.__plot_data__[title1].step, [1, 2])
-        self.assertEqual(plot_test.__plot_data__[title1].value, [2, 5])
-        self.assertEqual(plot_test.__plot_data__[title2].step, [3])
-        self.assertEqual(plot_test.__plot_data__[title2].value, [4])
-        plot_test.reset()
-        self.assertEqual(plot_test.__plot_data__[title1].step, [])
-        self.assertEqual(plot_test.__plot_data__[title1].value, [])
-        self.assertEqual(plot_test.__plot_data__[title2].step, [])
-        self.assertEqual(plot_test.__plot_data__[title2].value, [])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
deleted file mode 100644
index 4881c27d1d6..00000000000
--- a/python/paddle/v2/pooling.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.poolings
-import copy
-
-__all__ = []
-suffix = 'Pooling'
-
-for name in paddle.trainer_config_helpers.poolings.__all__:
-    new_name = name[:-len(suffix)]
-    globals()[new_name] = copy.copy(
-        getattr(paddle.trainer_config_helpers.poolings, name))
-    globals()[new_name].__name__ = new_name
-    __all__.append(new_name)
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
deleted file mode 100644
index 12efdc4a0fe..00000000000
--- a/python/paddle/v2/reader/__init__.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-At training and testing time, PaddlePaddle programs need to read data. To ease
-the users' work to write data reading code, we define that
-
-- A *reader* is a function that reads data (from file, network, random number
-  generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and
-  returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network,
-  random number generator, etc) and yields a batch of data items.
-
-#####################
-Data Reader Interface
-#####################
-
-Indeed, *data reader* doesn't have to be a function that reads and yields data
-items. It can be any function with no parameter that creates a iterable
-(anything can be used in :code:`for x in iterable`)\:
-
-..  code-block:: python
-
-    iterable = data_reader()
-
-Element produced from the iterable should be a **single** entry of data,
-**not** a mini batch. That entry of data could be a single item, or a tuple of
-items.
-Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
-/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
-array of float32, int, list of int)
-
-An example implementation for single item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image(width, height):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height)
-        return reader
-
-An example implementation for multiple item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image_and_label(width, height, label):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height), label
-        return reader
-
-
-TODO(yuyang18): Should we add whole design doc here?
-"""
-
-import decorator
-from decorator import *
-
-import creator
-
-__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
deleted file mode 100644
index fda5246d74f..00000000000
--- a/python/paddle/v2/reader/creator.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Creator package contains some simple reader creator, which could
-be used in user program.
-"""
-
-__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
-
-
-def np_array(x):
-    """
-    Creates a reader that yields elements of x, if it is a
-    numpy vector. Or rows of x, if it is a numpy matrix.
-    Or any sub-hyperplane indexed by the highest dimension.
-
-    :param x: the numpy array to create reader from.
-    :returns: data reader created from x.
-    """
-
-    def reader():
-        if x.ndim < 1:
-            yield x
-
-        for e in x:
-            yield e
-
-    return reader
-
-
-def text_file(path):
-    """
-    Creates a data reader that outputs text line by line from given text file.
-    Trailing new line ('\\\\n') of each line will be removed.
-
-    :path: path of the text file.
-    :returns: data reader of text file
-    """
-
-    def reader():
-        f = open(path, "r")
-        for l in f:
-            yield l.rstrip('\n')
-        f.close()
-
-    return reader
-
-
-def recordio(paths, buf_size=100):
-    """
-    Creates a data reader from given RecordIO file paths separated by ",",
-        glob pattern is supported.
-    :path: path of recordio files, can be a string or a string list.
-    :returns: data reader of recordio files.
-    """
-
-    import recordio as rec
-    import paddle.v2.reader.decorator as dec
-    import cPickle as pickle
-
-    def reader():
-        if isinstance(paths, basestring):
-            path = paths
-        else:
-            path = ",".join(paths)
-        f = rec.reader(path)
-        while True:
-            r = f.read()
-            if r is None:
-                break
-            yield pickle.loads(r)
-        f.close()
-
-    return dec.buffered(reader, buf_size)
-
-
-pass_num = 0
-
-
-def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
-    """
-    Create a data reader that yield a record one by one from
-        the paths:
-    :paths: path of recordio files, can be a string or a string list.
-    :etcd_endpoints: the endpoints for etcd cluster
-    :returns: data reader of recordio files.
-
-    ..  code-block:: python
-        from paddle.v2.reader.creator import cloud_reader
-        etcd_endpoints = "http://127.0.0.1:2379"
-        trainer.train.(
-            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
-        )
-    """
-    import os
-    import cPickle as pickle
-    import paddle.v2.master as master
-    c = master.client(etcd_endpoints, timeout_sec, buf_size)
-
-    if isinstance(paths, basestring):
-        path = [paths]
-    else:
-        path = paths
-    c.set_dataset(path)
-
-    def reader():
-        global pass_num
-        c.paddle_start_get_records(pass_num)
-        pass_num += 1
-
-        while True:
-            r, e = c.next_record()
-            if not r:
-                if e != -2:
-                    print "get record error: ", e
-                break
-            yield pickle.loads(r)
-
-    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
deleted file mode 100644
index 44a6e344630..00000000000
--- a/python/paddle/v2/reader/decorator.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
-]
-
-from threading import Thread
-import subprocess
-
-from Queue import Queue
-import itertools
-import random
-import zlib
-
-
-def map_readers(func, *readers):
-    """
-    Creates a data reader that outputs return value of function using
-    output of each data readers as arguments.
-
-    :param func: function to use. The type of func should be (Sample) => Sample
-    :type: callable
-    :param readers: readers whose outputs will be used as arguments of func.
-    :return: the created data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        for e in itertools.imap(func, *rs):
-            yield e
-
-    return reader
-
-
-def shuffle(reader, buf_size):
-    """
-    Creates a data reader whose data output is shuffled.
-
-    Output from the iterator that created by original reader will be
-    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
-    is determined by argument buf_size.
-
-    :param reader: the original reader whose output will be shuffled.
-    :type reader: callable
-    :param buf_size: shuffle buffer size.
-    :type buf_size: int
-
-    :return: the new reader whose output is shuffled.
-    :rtype: callable
-    """
-
-    def data_reader():
-        buf = []
-        for e in reader():
-            buf.append(e)
-            if len(buf) >= buf_size:
-                random.shuffle(buf)
-                for b in buf:
-                    yield b
-                buf = []
-
-        if len(buf) > 0:
-            random.shuffle(buf)
-            for b in buf:
-                yield b
-
-    return data_reader
-
-
-def chain(*readers):
-    """
-    Creates a data reader whose output is the outputs of input data
-    readers chained together.
-
-    If input readers output following data entries:
-    [0, 0, 0]
-    [1, 1, 1]
-    [2, 2, 2]
-    The chained reader will output:
-    [0, 0, 0, 1, 1, 1, 2, 2, 2]
-
-    :param readers: input readers.
-    :return: the new data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-
-        for e in itertools.chain(*rs):
-            yield e
-
-    return reader
-
-
-class ComposeNotAligned(ValueError):
-    pass
-
-
-def compose(*readers, **kwargs):
-    """
-    Creates a data reader whose output is the combination of input readers.
-
-    If input readers output following data entries:
-    (1, 2)    3    (4, 5)
-    The composed reader will output:
-    (1, 2, 3, 4, 5)
-
-    :param readers: readers that will be composed together.
-    :param check_alignment: if True, will check if input readers are aligned
-        correctly. If False, will not check alignment and trailing outputs
-        will be discarded. Defaults to True.
-    :type check_alignment: bool
-
-    :return: the new data reader.
-
-    :raises ComposeNotAligned: outputs of readers are not aligned.
-        Will not raise when check_alignment is set to False.
-    """
-    check_alignment = kwargs.pop('check_alignment', True)
-
-    def make_tuple(x):
-        if isinstance(x, tuple):
-            return x
-        else:
-            return (x, )
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        if not check_alignment:
-            for outputs in itertools.izip(*rs):
-                yield sum(map(make_tuple, outputs), ())
-        else:
-            for outputs in itertools.izip_longest(*rs):
-                for o in outputs:
-                    if o is None:
-                        # None will be not be present if compose is aligned
-                        raise ComposeNotAligned(
-                            "outputs of readers are not aligned.")
-                yield sum(map(make_tuple, outputs), ())
-
-    return reader
-
-
-def buffered(reader, size):
-    """
-    Creates a buffered data reader.
-
-    The buffered data reader will read and save data entries into a
-    buffer. Reading from the buffered data reader will proceed as long
-    as the buffer is not empty.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param size: max buffer size.
-    :type size: int
-
-    :returns: the buffered data reader.
-    """
-
-    class EndSignal():
-        pass
-
-    end = EndSignal()
-
-    def read_worker(r, q):
-        for d in r:
-            q.put(d)
-        q.put(end)
-
-    def data_reader():
-        r = reader()
-        q = Queue(maxsize=size)
-        t = Thread(
-            target=read_worker, args=(
-                r,
-                q, ))
-        t.daemon = True
-        t.start()
-        e = q.get()
-        while e != end:
-            yield e
-            e = q.get()
-
-    return data_reader
-
-
-def firstn(reader, n):
-    """
-    Limit the max number of samples that reader could return.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param n: the max number of samples that return.
-    :type n: int
-    :return: the decorated reader.
-    :rtype: callable
-    """
-
-    # TODO(yuyang18): Check if just drop the reader, could clean the opened
-    # resource or not?
-
-    def firstn_reader():
-        for i, item in enumerate(reader()):
-            if i == n:
-                break
-            yield item
-
-    return firstn_reader
-
-
-class XmapEndSignal():
-    pass
-
-
-def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
-    """
-    Use multiprocess to map samples from reader by a mapper defined by user.
-    And this function contains a buffered decorator.
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param reader: the data reader to read from
-    :type reader: callable
-    :param process_num: process number to handle original sample
-    :type process_num: int
-    :param buffer_size: max buffer size
-    :type buffer_size: int
-    :param order: keep the order of reader
-    :type order: bool
-    :return: the decarated reader
-    :rtype: callable
-    """
-    end = XmapEndSignal()
-
-    # define a worker to read samples from reader to in_queue
-    def read_worker(reader, in_queue):
-        for i in reader():
-            in_queue.put(i)
-        in_queue.put(end)
-
-    # define a worker to read samples from reader to in_queue with order flag
-    def order_read_worker(reader, in_queue):
-        in_order = 0
-        for i in reader():
-            in_queue.put((in_order, i))
-            in_order += 1
-        in_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue
-    def handle_worker(in_queue, out_queue, mapper):
-        sample = in_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            r = mapper(sample)
-            out_queue.put(r)
-            sample = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue by order
-    def order_handle_worker(in_queue, out_queue, mapper, out_order):
-        ins = in_queue.get()
-        while not isinstance(ins, XmapEndSignal):
-            order, sample = ins
-            r = mapper(sample)
-            while order != out_order[0]:
-                pass
-            out_queue.put(r)
-            out_order[0] += 1
-            ins = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    def xreader():
-        in_queue = Queue(buffer_size)
-        out_queue = Queue(buffer_size)
-        out_order = [0]
-        # start a read worker in a thread
-        target = order_read_worker if order else read_worker
-        t = Thread(target=target, args=(reader, in_queue))
-        t.daemon = True
-        t.start()
-        # start several handle_workers
-        target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper, out_order) if order else (
-            in_queue, out_queue, mapper)
-        workers = []
-        for i in xrange(process_num):
-            worker = Thread(target=target, args=args)
-            worker.daemon = True
-            workers.append(worker)
-        for w in workers:
-            w.start()
-
-        sample = out_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            yield sample
-            sample = out_queue.get()
-        finish = 1
-        while finish < process_num:
-            sample = out_queue.get()
-            if isinstance(sample, XmapEndSignal):
-                finish += 1
-            else:
-                yield sample
-
-    return xreader
-
-
-def _buf2lines(buf, line_break="\n"):
-    # FIXME: line_break should be automatically configured.
-    lines = buf.split(line_break)
-    return lines[:-1], lines[-1]
-
-
-class PipeReader:
-    """
-        PipeReader read data by stream from a command, take it's 
-        stdout into a pipe buffer and redirect it to the parser to
-        parse, then yield data as your desired format.
-
-        You can using standard linux command or call another program
-        to read data, from HDFS, Ceph, URL, AWS S3 etc:
-
-        .. code-block:: python
-           cmd = "hadoop fs -cat /path/to/some/file"
-           cmd = "cat sample_file.tar.gz"
-           cmd = "curl http://someurl"
-           cmd = "python print_s3_bucket.py"
-
-        An example:
-
-        .. code-block:: python
-    
-           def example_reader():
-               for f in myfiles:
-                   pr = PipeReader("cat %s"%f)
-                   for l in pr.get_line():
-                       sample = l.split(" ")
-                       yield sample
-    """
-
-    def __init__(self, command, bufsize=8192, file_type="plain"):
-        if not isinstance(command, str):
-            raise TypeError("left_cmd must be a string")
-        if file_type == "gzip":
-            self.dec = zlib.decompressobj(
-                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
-        self.file_type = file_type
-        self.bufsize = bufsize
-        self.process = subprocess.Popen(
-            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
-
-    def get_line(self, cut_lines=True, line_break="\n"):
-        """
-        :param cut_lines: cut buffer to lines
-        :type cut_lines: bool
-        :param line_break: line break of the file, like \n or \r
-        :type line_break: string
-
-        :return: one line or a buffer of bytes
-        :rtype: string
-        """
-        remained = ""
-        while True:
-            buff = self.process.stdout.read(self.bufsize)
-            if buff:
-                if self.file_type == "gzip":
-                    decomp_buff = self.dec.decompress(buff)
-                elif self.file_type == "plain":
-                    decomp_buff = buff
-                else:
-                    raise TypeError("file_type %s is not allowed" %
-                                    self.file_type)
-
-                if cut_lines:
-                    lines, remained = _buf2lines(''.join(
-                        [remained, decomp_buff]), line_break)
-                    for line in lines:
-                        yield line
-                else:
-                    yield decomp_buff
-            else:
-                break
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
deleted file mode 100644
index 107d5912e15..00000000000
--- a/python/paddle/v2/reader/tests/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-py_test(creator_test SRCS creator_test.py)
-py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
deleted file mode 100644
index eca2dce114b..00000000000
--- a/python/paddle/v2/reader/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
deleted file mode 100644
index 7fe374e6636..00000000000
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright PaddlePaddle contributors. All Rights Reservedd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import unittest
-import numpy as np
-import paddle.v2.reader.creator
-
-
-class TestNumpyArray(unittest.TestCase):
-    def test_numpy_array(self):
-        l = [[1, 2, 3], [4, 5, 6]]
-        x = np.array(l, np.int32)
-        reader = paddle.v2.reader.creator.np_array(x)
-        for idx, e in enumerate(reader()):
-            self.assertItemsEqual(e, l[idx])
-
-
-class TestTextFile(unittest.TestCase):
-    def test_text_file(self):
-        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
-        reader = paddle.v2.reader.creator.text_file(path)
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
-
-
-class TestRecordIO(unittest.TestCase):
-    def do_test(self, path):
-        reader = paddle.v2.reader.creator.recordio(path)
-        idx = 0
-        for e in reader():
-            if idx == 0:
-                self.assertEqual(e, (1, 2, 3))
-            elif idx == 1:
-                self.assertEqual(e, (4, 5, 6))
-            idx += 1
-        self.assertEqual(idx, 2)
-
-    def test_recordIO(self):
-        self.do_test(
-            os.path.join(
-                os.path.dirname(__file__), "test_reader_recordio.dat"))
-        self.do_test([
-            os.path.join(
-                os.path.dirname(__file__), "test_reader_recordio.dat")
-        ])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
deleted file mode 100644
index 6b680e39f3f..00000000000
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ /dev/null
@@ -1,178 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import unittest
-
-import paddle.v2.reader
-
-
-def reader_creator_10(dur):
-    def reader():
-        for i in range(10):
-            # this invocation helps testing paddle.reader.buffer
-            time.sleep(dur)
-            yield i
-
-    return reader
-
-
-class TestMap(unittest.TestCase):
-    def test_map(self):
-        d = {"h": 0, "i": 1}
-
-        def tokenize(x):
-            return d[x]
-
-        def read():
-            yield "h"
-            yield "i"
-
-        r = paddle.v2.reader.map_readers(tokenize, read)
-        for i, e in enumerate(r()):
-            self.assertEqual(e, i)
-
-
-class TestBuffered(unittest.TestCase):
-    def test_read(self):
-        for size in range(20):
-            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
-            c = 0
-            for i in b():
-                self.assertEqual(i, c)
-                c += 1
-            self.assertEqual(c, 10)
-
-    def test_buffering(self):
-        # read have 30ms delay.
-        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
-        last_time = time.time()
-        for idx, i in enumerate(b()):
-            elapsed_time = time.time() - last_time
-            if i == 0:
-                time.sleep(0.3)
-            else:
-                # read time should be short, meaning already buffered.
-                self.assertLess(elapsed_time, 0.05)
-            last_time = time.time()
-
-
-class TestCompose(unittest.TestCase):
-    def test_compse(self):
-        reader = paddle.v2.reader.compose(
-            reader_creator_10(0), reader_creator_10(0))
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, (idx, idx))
-
-    def test_compose_not_aligned(self):
-        total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0))
-        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
-            for e in reader():
-                total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-    def test_compose_not_aligned_no_check(self):
-        total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0),
-            check_alignment=False)
-        for e in reader():
-            total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-
-class TestChain(unittest.TestCase):
-    def test_chain(self):
-        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
-        idx = 0
-        for e in c():
-            self.assertEqual(e, idx % 10)
-            idx += 1
-        self.assertEqual(idx, 20)
-
-
-class TestShuffle(unittest.TestCase):
-    def test_shuffle(self):
-        case = [(0, True), (1, True), (10, False), (100, False)]
-        a = reader_creator_10(0)
-        for size, checkEq in case:
-            s = paddle.v2.reader.shuffle(a, size)
-            total = 0
-            for idx, e in enumerate(s()):
-                if checkEq:
-                    self.assertEqual(idx, e)
-                total += 1
-            self.assertEqual(total, 10)
-
-
-class TestXmap(unittest.TestCase):
-    def test_xmap(self):
-        def mapper(x):
-            return (x + 1)
-
-        orders = (True, False)
-        thread_nums = (1, 2, 4, 8, 16)
-        buffered_size = (1, 2, 4, 8, 16)
-        for order in orders:
-            for tNum in thread_nums:
-                for size in buffered_size:
-                    reader = paddle.v2.reader.xmap_readers(mapper,
-                                                           reader_creator_10(0),
-                                                           tNum, size, order)
-                    for n in xrange(3):
-                        result = []
-                        for i in reader():
-                            result.append(i)
-                        if not order:
-                            result.sort()
-                        for idx, e in enumerate(result):
-                            self.assertEqual(e, mapper(idx))
-
-
-class TestPipeReader(unittest.TestCase):
-    def test_pipe_reader(self):
-        def example_reader(myfiles):
-            for f in myfiles:
-                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
-                for l in pr.get_line():
-                    yield l
-
-        import tempfile
-
-        records = [str(i) for i in xrange(5)]
-        temp = tempfile.NamedTemporaryFile()
-        try:
-            with open(temp.name, 'w') as f:
-                for r in records:
-                    f.write('%s\n' % r)
-
-            result = []
-            for r in example_reader([temp.name]):
-                result.append(r)
-
-            for idx, e in enumerate(records):
-                self.assertEqual(e, result[idx])
-        finally:
-            # delete the temporary file
-            temp.close()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
deleted file mode 100644
index a2a8d47d438..00000000000
--- a/python/paddle/v2/reader/tests/test_data_creator.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-0 1
-2 3
-4 5
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
deleted file mode 100644
index a99a35bb829e066c4845d0b85b96cd1eb3a12491..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 76
zcmZQ!W@4P2Bs!asfq}sSh?#)+KN|x>v0q|9K_sIV14Bftj}1RiRKwGd%hQO<)0nHI
Tz>rH1B4onlY0Bkk1`z@P(}N7c

diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
deleted file mode 100644
index 17aa89b6796184407e83246d3f342a55a66b4a69..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 88
zcmZQ!W@2QOHw<B9U|?_oVlE*5&&I$|?3Wl&5Xor9z;M0c)+Lav0f;aJ5k?@w7(|$W
R2vZPY1|rNsgawGO1OWMk36uZ;

diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
deleted file mode 100644
index b4333ed530c..00000000000
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-py_test(test_op SRCS test_op.py)
-py_test(test_image SRCS test_image.py)
-py_test(test_layer SRCS test_layer.py)
-py_test(test_topology SRCS test_topology.py)
-py_test(test_rnn_layer SRCS test_rnn_layer.py)
-py_test(test_parameters SRCS test_parameters.py)
-py_test(test_data_feeder SRCS test_data_feeder.py)
-py_test(test_paramconf_order SRCS test_paramconf_order.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg
deleted file mode 100644
index bc1fbbd371216b9904b522ed302700c79d2e4876..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 57218
zcmbTcXH?T&^fefI??LIk1rR9#>Ai#!N<tTvCXfh7mm<9yL+=r(352E~9psTBozN1B
z2#QnzL1`k0I{e>v*34S-X=ZL#@+H5Ob$9ML`|P{V`M30M4Zvz<Y-$W3BLe`)u3vzE
z9{~mcT54(<YARY98X7t}T6#tx6C(oyBR4w-3y_~jP=KF@k55QUPEtrjR+NuV>JCU&
zUI7dS3rebKsVZv9DS;LL=SRrs=;#<37`d33xD<u?gcbikuYY|2Hd=B=3Ly$I5db+G
z83h~JzxM#a>wQv@{jUZ1KNlG}1tk?V4J{o#!}SAgtN?N{3JP*c3Mwi}%Il}2ufGRS
zvQe=ME9g;k*tpS%1OgS~vnyyt^_%-SVY5HPl-z?7=;&|U<l^QLmynbKNrRPD?x?D%
z8yFfHn?Ov>;I?-54vtRF9-a@qynPVJ;E>R;C*cv1iAl*RscFwpIk|cH1%*Y$n93?F
zuDS+aTlc1=wXMCQv#Wbx@cq#6$mrPk9Fa7?u(<SL`OEso=GOMk*WJCN<CD{~pXa|W
zF8>D?8Gzz{!@B<b-@yJKxY(|7kyBDqP}2MlE;8~b*KZ0oN-ALmYIZ#v8n-|W5yf~~
zpni5mb3dJ^66^=3d(bTX4KeT+@uUBN_P>z*zXO)={|njw1?>OEwG3dSAiF+13O0Za
z;ByTOwm4ChWV29^1<SQ9nn|g2BR~-87S!%}^e(V-ygR*((Ji{Td|p!Ca9t;x?QQrS
zP73>G&sL$D?W+5rz7snkNFI#nfYUAyBkLzPqV#6w1I?s2?(ioJO`>i6jf|D{&5RM_
zJ#OwIiU<W58%n{k{f4lNX**8j1xi<*-UZ%mW140Sih|KQzliJI?ITjD!pJ^gv6Zy|
z;sT0^JeRCGrvy<Q#~vqDV@=LxA%Y{%YP4ou&*c;buD}80S*S26a2y)tr-?#fg^e`f
zMyhZ!A7C-Anw%J4?S(#pcs#3ofl<<>AAW(MtKNQ2VeqUn3pSJNR$YWNun@s8Fk*2+
zmoS(skO#yh!~<33al=t#xTU~d_>&<i<w`Q=Dr6v-XcV8-Re`vNU=6m73vkYNK>2Vg
zGE*_qqSi=wkdqog%^pj~>Rv;GdWN(PZ@>=6ciTl!IZ9L@Mf&rdB1uumG5q++-D=Yt
zpFlXIFHC+tRfwxi8il6MhI%cH0dN(Ou(?j^@aiWx2^1o3PPLl34a8>~zWu_3YXVXa
zW7FlVj3j5h1?8$H&b8#FK&Uif^fC$sMz_GBRmviQ5U9AYvJnFmHx0}?6b>|rv@tbJ
zbC#;aQi6$=XV?*7-?L=h^@b+AKwDZdlN8XslnU;eN*`E_3WW|!lTgyiwe~4}0l+m7
zm3l=Jj}aIZQJssCqhN71FbKsGY($~JPJ)C2T!?Xwh!`k$v#ci8E>VEJa&o~l2`^E)
zpn<I6Y@0SSqy#R2J>N=M05Kj>Zn$0ud_MQmHsv~NkzSnKbz}-d;<v;NSd-u^&JooZ
zcQSJbl|~zw%t)NNhzuNp#c6>UUFN)U!EF$%Ft&!x6eY_@rHuitUIzmdT>SzHfN_rE
zYTREK;Va2$tG8?1vaR1-+8!sSO`e#k!>)stt*F&25vm+WW?BVa*)FL<HL=cAQI{GC
z@y+S;QCZdx87I`(!1|!A6^LI|z_^>JFoeeJLQ7U3COuuf$R{mDpfw4e&WUII2pL4>
z0y)bKRm<U{Lw7hCJ=4N*5*SrnEg5!hjz<fG4%~%;TT}AigHLh5Bv|9UQyxLf`=Hh@
z)_wWV+ejarL=LyBHKMBu+?~LG4X`$k8;1pY<-gZy1IEpot=~Y;!Mw5<^ma|nEohf3
z-Qd|aO$47J*7cXD0GGM-7nJh!VW6ez5*Qa5TnfK{yPx-htPkD{0nIbT^TMH8bxKfL
zw|z4z{$1UAR`(azo1K$({APj*D4$V>L1g8m4$MvMT$_ldW(X<PFP6YrD}HIm_sHZu
z4;xi{Cu?cz=03zo8s%fShuFXwR+nK~0rPC&pUT|I7J3*ni&^XUz!a2=HGoPE)Cb!@
zVJW?GN=VBd`cJUn);y79axBA;2qUJDGtj`$5RUY6Mx(^y)2>fbsK(wiyG;cvQe=;D
z6{>+N8cC_uS#*xVK*@-8wB5}h_P29>L(ukIKMC}*h)B`A(k){n@h&k}Pf{I70VX})
zPKJWENqhn+zyrzYi;>El(kQNxb+k1IDx8ibfU2M1i;}^E3okK<+B7CGJrn&S^RxgU
z7KU<mH5@3;_@6-Ilm*!6;)WoMV0sv+byx#DhX{sBWF<-DTY(26aGVC>`&OJhq=Z~u
z9WalXK7z3s6Dkj{BVz{g_#^b{zRDB0u~-5mHPPIH`XvV7GRH$DCq+iKT?l?DRZb;>
zG1AWEwDI6>M+d^#AV!XC3e{dEq;YCBD`j;|ob3dxFZz~Bp1nJT1^5K2454-}p%SX0
zub>jH%mp(du)yl9&<emFq6X%Ur>s7&;qen@241@22ohjQB2Nia9`hV9Tuu)skYfxp
zfL+*Ky+&4As4-L(gT`3sZ=>gIV8VJejq>mXyk{H?Lq&t}5CJbo!I=Ol7?(hElSo}i
zz7*b;RDN`K!~;?R!QX8(v*P3s@F^I`q7MMhfYzIvubI>hhH)#Wg;dAG*q}q?00e0U
zL^=eIxaacCp)}@Qu)=({7Wp3dYBmc$)`MCgEPF_L&XalMI?OCdbJ74;K2klDytEe)
z_eGBz2wP85RM!V>!gz|Q(srv$5c%Znd7;&FsQR=lo=|z70A*KE+fY$kuSyHo>vS%z
zVb04XV=+UN6HV(n`!EJ`If*ceC}OfwuPRR~<;4FwofKgNPeB%GiEc;^(OawJ)Htl#
zCUwm$0}T!W&^urbXsxRzPdNU>iaLoh5d#BR2U?0Wiln7sfj{xTNn?&&U(7%>W&zkD
z28ao8&%jworW3#C%_#sv+rVC%q3N?=KJIfrv6U5-Mp_7v`YZ+ku)!d}Ue{DqjFpIo
z=;;EV;5dEUXN@Qt0m9Y9CD%1lsK|{%@XC6|=?8PSyEw&?NVv44JPwPiV80~{Ze38W
zWE}=C!xzvnAHD13p#lMO$ck%f@kPWuB6$EJ3JxUhqCH^lrA8`QOhdw&#7witaHCYW
ziT;{ID2}skmPdHFh>T#moE3`wAgf?v<}!oeaHCtt_(&9E3dkTv!r(8av_`;n)$91M
zFo?9lI3xD;c|1sS*^t(~`ochL;B{pn`ndZ9l+)`iluA>%b>fDkfUN`lW=vez%0bpS
zfxIp(@)=;z!XLC<G*8Pl?reEliPtCAwjR#=*s~JpYT!(UDdcFP4?jYf-U>J#!3S4B
z^)P~%xUpa{a8eDumk*P@$N(|^I>h>Bgjexk<~&=lA}qW*R}cK@EQ`5@-lxW^L3cP8
zoDL;GSg5i0+h~<VFo5=&cr&PRbzWH{se+5BF?XW^>aD;6JhA|OF%#v7_z;xSk#Qby
zC<wevt%f+`0$&lE4uMHCP%tL7OJq9><ByQe5)Re_=UMkZH=_c;=9-dlqlchslxynW
zN|e!=V0KO2COY(xNrd#csmJ1Z=@Vv_Qyt<l9;0%fw()rsSrg=Y^67|;+4o29Cg>fX
zhy`i=16-bUH2b;iWw4EL6^U|xx9o4stsYd&$)E^bjvcOtR*wPQjkc5)=c;Y7&n{cn
ztsicPR#dZnYViA*wE|lE9YwOhIo7VB2vu}LHTZ$I3F<s%;lI-@u%k^pXkyWI*?K>G
zjsq?S&C>>8kpa6<iTI^tJlu?w*m#41<CdAUGM{ZHwhvKFB;3IPQrbi=XGSo(;q{PN
zVbWH$H^xrc!ZYaz0!lhDWAggI7!ATwqluj<cg)b3?7Ew(weD`Eh;bwzv41;Bqz~9x
zT|qF-kp4zNTfL6GR#;hN=DNVn?S!nnygdYX{3h@V6nL0dgaN{cCK82eU}5HiYZ54p
z$qO5+rmzAQ*9hoN15O26M}+vRIvzB7i{twA5lM5o7PTl|_B?p78IwwbsSpl4<0FCb
zXcEZhg`ZgX`?#u<RDqYXK_)>)BftT*+9)`rvB8EJ#0AfErX|oZvGOQcJ;Z0|RXocD
z)#lB6YLfQmMI?Yp>lmH{lB361Igb}&;7yal8nL#rpq5<}PKN^%&HAJ?Np(mz7zj(b
zhcLHwol{t^+vvQpT~dR})Fj3a8TNC4YPKukRs^X-g%9hPVKk9z;ST@-<}lU8?@=tg
zTrL8ASZ}qUlz3r=sD2ZbW+sJIHbpC0T{GYS-XfqfUJne&d$|i0mlvRkS>1<2-<&M9
z@Jasz+>FI}4V^ddJF~AUzshq04DwO?&*QjiZdtws+ZO$SJ0(og<}#~C#ygFHJER3}
zHDF&R1%Ir;YuYOi#=KTGGps~fLmFpB-FLTxXf-`UUd@ZGX5O=DeM~C2W$ZIG;^1IV
zA`ojp@cn=cTl7&o{?IAL_bU2NT_;MWvOh&^O646ROLc6;m2?lH`FZ+;_J>rvhoYi0
z8D}oNX<PSGO4<`1<XijyJq#{`X(5vdr86u-nT4gLpTGRF+jj!~*>-tN^+a&yj5D<I
zCaqJu>R02#cMl94N#&TCVXUZ)vx!1e72XX^vQQ}z(BN6^BL0Ov&rvwuY5oT|i<KVl
zN~$kGvj@>!oD%}4UOelECB{R$*D+S$b+yZo>J3q4jERx~!bp{AgbT;kQLWu(&p@kI
z-64WQOJ$LNxP-;tm;q4QiGkB-3G}V&oi$SH@k`k>oD!<l_#yRd(YrXqkv6kIO#C!B
zo=E~tG~Leoft>YmAtr+vG2uXLf11w}2nr_o8}RXzFqv2q<oSl{#};t6tkr7WJ(LJq
zP<=~g(0Gh5>~4G@lcl<y&orlcsf=wO+*I%MJ5(Z<Y@h(c1OfxIl88|=h}mk$sOl*{
zluAtc74`P+QsmsB<Z`KpY~Yni(*bM{IXBi0mW__0m;K<9;Vjb5{7Fa9%_>7hRKrrs
zio8Q;Hb@C{(Zf8r59e2zVtZyjZn~LET4)*^+auWXKueC_J0_@2<J7n8SL(7DHZ&l+
zXa4~7yY6@8DU#_G+F~ATO20s;3_E0FPcmINNWT~U8=ja98W@u??4ykDy>M3|mY-ck
z*;1C))Hd?|B3B@{yl{^y({suQ*z{I?g=}t5)vVm8Vw{7g^>Ob*IWhJ^h;n!umi(D8
z^E8kHD)9o7hcak^Ae4-e-y|d>^I14KwV<l+4O<t$UF((E_2xiHXHXsK+jf#Lm#tA0
z2}^WNiE{<6LB($w#l?Js-Kgdmx&!WOvjk3^esPrdjGNo8x>h_`2oIthqmO7Jv!bpd
zXxjVI=~dKH2`fV=!E1=p(=L1tk73`D6?KJu&32ts#fR!#V+8t<H95ja2~1&q&L#!^
zc4Xs)rN(e!;ZW_Y3=mj`jTlT&C4wYw2!KdKHK_Wy$^-<6u1?>58%<Oq$RXktH;>gb
zkkFW&@#ch8YBI8n&ygs0hS256M7y_^_dbc(1X+8YQFlvyeWAtopR^sleGrx3WPsk)
zzPl<(Na292N5|lxbY$Pr{tJVYaB<|qeEV9=z5c)irG>VP)GnBcGD$W&sgpj344tLl
z(R_M2r|x1RSingNv-cbPq4Os)*)yQLR$=3vX6$Djl3m$>VY^WV>>y6fNO08S>gTJx
z^>Z~bY`b(sPJ#5yRE~!bezhb9xD&C+BE})1lWuZxRX4Nnb~3qR{HKhhPFf!}`|Zjb
zN`s6FvXM=z<BB1zV&*?KVRM0+kJ=K|H!^m9oZ)Z#*_$wy`%0@Qo;Iw2Eu5@u5^VTl
zE!%3m@fYZY;~c&X>A$^cXEYy=_npk>%_fTtEWb|YT7E3BDZx|)a(w(xB~D@YUXXs(
z_OMM)r0tV1wpBxigr(s}rM8XDV{pdOW;I!D+r<9r?ySIje9pRn*DgWQy`M7nruMo3
zp7xe6+NL6`bBtvn&Y^%Owc_#l@gu%G%gu9R0l!Y`7oD9)lS#2RcP+uIsPvnUO7iew
zpzr88s*6~juaCiDY%i-M@mKo_4W1U{3Is{&VkIn5I-?aE414>Kc0C)eIvtF8!F8;$
z^z83GPa}H%gI;VNa~sb<*xC+vZF|+`6PX6F?5_wj&)n}0NsdDCeaeCbP8sSH@y=on
z;;0G*b5z^r8nZg-p+ebsUZwdnW+kXdwcXf|a$fvdE#x^Hy#cJ@VY*x&nRebcK3o{V
za1}WSnv$lW%pFl?qN}7BtL2RF`o^THZ=U?!;r@zH|5qLObzSIhb;y(?Z#-Bk{%-Dv
z4022yQun1M>j`UQe{(x${97E+$~i)fFVHkIk9p_9c4=zR1iTkwQ|OTae4>?X)^zcr
zN&fk#TH`Hzy8(~C<V<M2yTpSmmXbf;!loY|)rYBQ4D>KH-<}NqOjNGnY%-K2i`wCq
z)cR8oAwKqaQ^0FJihEoWz?O2aJAP4cuJu{M`0W=HR8bUtb-8=iAGcK(ceFY@|3Kk3
z%KHUUljR{_X-JDk>Ii-_w#<R&x9i^$-T;L{xLk5#%0}%F4>2xp&&e4=zkG=C`UeP!
z{2%^?FGoEJOo=`m^vSrL7p=4?@L$`DC$YV8c2;?A`f{AwrGL-bNcK2se=9?W9@2NQ
zThe~<fDb_LLcr;*j8tw_9PzBgMOLoJ<@o){uip#ND=U6nu;7`E*YD06QuAj?StjMz
z!T98kEq}&uM>K6!anMlecoh8rTmIvsr~d$}r<eZ#$-ezzZ+^vye6t&&$cR-RQRN<4
z<?~+X+W%;rDe%@$MIpnA9|WjE&n<$+--GjdYc*zsx!_i2!2*b+6R1doxCDX@s~pC`
zkBJnJ<|Cm<Gh*hY(sVqfgdvCQx|Es6H8R`ft5$Hhn`09%IEfU2y=7t0V%lBhRs9`1
zoDK2^QP@V+;yTO2z%3{x2ttP1O9$Ljr#ahQOB<-=Lr@Bp^YP$QC)G&<_u=^U3UYp^
zL;`}Y&WH|UF~?zwNujJmtdWF>{p)qYfty4+96^45#es2#2ZNC2Ws^8W+9}^vRuquV
z_9=mJKv(s^#RjI;+bQ4`Qd&0RLp6mN6WyNhDbW&`WS_>vU5ZdLL$nUL8>*_E*0zl%
ze!M$c1t`b6i-yzm-hZ1pN%WA`1LbcXoxZ~GRA+pnY_#M1(_p@xTd?uE4Uux38!mX~
zNA8r?UI}tnXG(?9`|5H&MLfe^WT$2n@(-Ys|H$KWZkN~-@eD7fGx*|<+7?=>tHqee
zI!wroJM5h-p5qER_fTsw`7IA0F;en|XKzm{O1RXXQXV<$GaY)U4FT7iS8kXZzKCKe
z4n9-cI3_@fvwT}xVk4B=biP;8T|K(z_fi)jahw+vp#J-b<PrG4M;_;_hZGGt36LuV
z)K{D45zxv+w0cCGJ9_CGmM=z2x?bjPnap>O?Cc(aD)iA^j=Rq$E{Li0io?cVOn!Iu
zUBx<AXa@Ly1eW?fCn@niJGF`bMH&}Hf3Fg(L9!RhPc=5JdH-(ws6DdK-nCj9m$}dd
zRx(ogmGOglk#&tRif`kG@|@$RAr+g?lxW6`cYh+shBpek4^w43WfJXSp&|Wy$dcO4
z9)WT+s^MOC3a_SvlD|Yr9ppor*rT>c_|?#6Si{oclK`lNAiZ$}&2e?Lm3ZrI)>i@1
z(v3q{e-f#r_Q5L<K~iX^S36{;_vPgI++cxAaE5d}!tbpsA2XBRGY5pfB9ymZfYf-y
z^Q%$O?HX_pBQxo1>wp8({@oz3Z=ax903aWD;ZBb-_Sk*<L!DV#^)xqkSmwzSHIdRJ
z&-vpDb(!F@zhl9|z5$&8JDGF|&&=epu~+Y<Ha!}J(|LvB3Ctx<C3uN|*$<{Wf`u8R
zT&FkLBLJ@+(_8K?S5?Z+cOh77ETipjv9!xK&BG&X<R+r!061C_l-<9>NrF}z4+2~{
z>>d)kwbf4XAxr<gno~<P*bd$&sLzqc<RGJb3;lCtfnJ*>3mL_@<i|yUs7$Xz3;Dxy
zFRkhVr>8sjt@Ip(XG_0eOsQ{bvLL^{#m`Vt#G7veiYOY9wL|_Pb<zt;p2VsI384_d
zmqEe9!%rl0F^3QT9KZ2ST)-RP^tQn{=#vr~DUgh)bgK&nvSrkh{{;%=N9UB~oAS}`
zZKGDIhjc;NdFlFf@fp%u7(G78U=bz>Qy@!s@%~;k@_083`{<_q=~?}#fIWS$R`M+Z
zp)DsUSh9Q*vj8Nj*nVMwwKUkz6y$nww1iN%;7I??cM0E;P%j!8*J-?|sJsDESu)ak
z$Ss%!OsDiuP~&%}sH|Onmn}HDFq>(J*dDEF6@Is9z@}9yqV%h@)x~+rgS)_V@pp5l
zV8zH;rMZ?RV|1~5j)<z%&ZXe4b!!LmiKgP|pg(WiRa2~jgJ*}4%we(f-Ke)r?YSa%
zybY$WXgy<v?Zi7epQqfbi{L)JlKyQPRMN*IGN}4#%c{F%<A+GL|LdRm+4C$mT{oG2
zd0?h&p0z#{U)nje?$G25Yq)Uj%&`;|ifrn;+Q)Xk%ch{yg{iLc0Aht5r{0+T^?dQr
z<;Mp}!7{TkY(~;&(9GR|5rcMkhrg-!&apt7zCG3WYkJj7d+e+1uM6XM`PgN+Uk2|x
zE`H4{d#@9nF=Yj$e?EByva838C>d5JfXcT^kPSCPBcG*P3{+_HAeaYiFfLws@fJ|V
zn%n$7AI*%ZXxZ6bpc+MOU#-{$1tGJR*@B>!<)pw@P-nxMaah`9pbbZYk&_InQIg5P
zaEz^M$fT3_sklR;dH`|<9>4ygiI@s>DjY}hm<0moWH=a@z{AKu{IoMxxQ{rZ%&H9b
z5VKWa7de4%kSDp&2Z!L?=S0B6;8%e*O4dt7*Z%3sEGAA2eyuj)R;v!IVg&G<R~Dw4
z3}Z2i@kf#m))^_Y)}VZ_rrini+~5UYJF1a_?JS<>p}f2JyllBETYRwX`R07u3lxsY
z#C>KLVnmM^uFvBU_HP}CsOiGj-|3Rgc8Zk0((7-0<m6tk`bAc6ID)1mT83A(zFkRV
zEJdeiGsbRK-#*)RYXUqblHeUI{<fgyJ&UlX@VoD&!>#O+WJQy<@#kUG(;M+lL1TW1
zC;vUZwG_n9nOOEd^a{Bgo@wQl-;rLX6B*pE;}WZMKQAJ(^kY!Y+feYWwI>t4y1f(S
zHe~^HW0wnA;TskJda?-xX&t#U)aAx_yp#6y$oS~!puop;y;d`eOnmB!V%K1dCZB6G
zvMXmP!~X9rrbms2<YQcjZLWb~vgK%bRf>HB@%M4PlV2c!v_&qxz34rD#bMz42~hO7
zneYAo{w77*)#y&@+DY?<d8yeP+J@>lE3ZEL;Db6~3qV=wZh64p=^O}reyNZ*!kb`D
zG3DZ>aC)dCJ3k)>xc5<%+sl>gv3}r8q!hkLwkAaH=)48Z9vu{0Ch?qO((My@9CsfD
zpU{2C&-;l-)gu85{Wl`SlX+j}Y-InP`4+8kmmHWXIsOj-3C`nXkUtaEd9oMic&?7T
z$*%PFZsuhtJgVc0;c|nsq>8klXk{GIPSlI`r-$$lYy^Bis|%75O&h)FHHjQ)yK^Jx
zMb3MZubJ<B&Xha<0d}R|3<r;@!HH!gdrslW*QCIwuU2U_KZ6ooLr3PSe_M)WdB=t0
z6HVke<3CYTNAWbudv^Qt`kP)Zf?J(=Wt8CIs%q!u(<LW!b@hL~Wbgj6F7M*{8cFI&
ze(og0<GqKNkYNQr`D@3K6SzB9^W4YVe-MS0Fl#Phk0i}1rf!muOX%i3t1a8opqBVj
z4ZBavxsP7Ba8CD{^!S#}Q$C?eiHfw|`tQ7)aJSH%U9KusDABC3-2j7otj&8=E8y1J
z{N3t>?iMKrp*j_bNn;|!V-jO5nJikQ3$zmol?N8YDYjNouf>X0rz1lO&07+$S@X4H
z|FlLj^SzY9ppXE^%~($8_P1f_1`BGdr$o|WkST6*#8Yc~*cd5%XWTL}48N%(7i~IZ
zU%DnnLt=iUGH$k2rC`LRSt>30GJfB-z#)sAY%08##OImu5)bozTFb1+R#H}!Aw7g;
z67|fEi&gh)D=YfsTeHPiG8=*C5>*&_qpj*5A!x+6xj@?e?9g1Y<0slBe!BaJLn=#z
z?Zkucx90+5A-P99QfyiuFukTku<eR?wIyP%d)*<6qv}0DVdni3fTc?j-m_^kXQyY=
zZI8d|r{{lA#7!y`m%mlO{{vL(nR&D@&x3k9T@$>sl|J)%6DeLrCa&0x-ogMHfdtJ5
zA?%-3e@gOYj(|@X1IncDJ~dpmi~7s2BbU-@BRs~<5*&!sgH*_*_a~5ZMIK!Utj+z`
z5r9svdRCF;Z)%(Ui|+|%EuZd<<PH0@oY_%ogXrMdUXz&0mP7DT*<`yfbyh_7s|d>2
zCySS87}9>W@FvZXa6P`|hVgAS&D6A=qmc*WmEFwE_1VB#?XWSf;phmi{;5aGZ}*=Z
z7ainViuw=r#Fhm*2WeP&a2T{XO(+=5oNfqO&$IbEE17WKXz+M2K=AiWr@-^Au}^$6
zId1qJBD-74dtukplQNi;-B0-xdt6h^vWB?4_hvd;V4hy-MCz;7ko#1rH&0|isNs_P
z^KwIe=);oRVt+Yn>pCBe(cZuE#(Dl1>~a>P@^s52LgiYy8Yxqmy{s3Jb<!JBMIl3K
z)lojJY3YW8m=e^pZekM{On;wb9clZ90=%$m9Be@D13|EABV~EsIB;^@R>r!IH+@Z^
zt&b0{J9<stNYU3=MPbMGs?FYLM>*hFnvNRTTP$}|<{|=EDTr*U#pDT9z@@?Cem6AW
zPrJLvO6R%F_!Ud&(uHy0luk>wqH4tx7!gwURfElg6Rg>0I4>*7c|&5TDZC;RNFbpH
zfsS+UVFE}IpX>|#d7$;EAjaj$X1h?+cFBN5Y_R}(V_GTtYrLv}UDd-nV4^WX36qzS
zb!2&90ld!OK5)DAO=J|$Yti*up_`LKqNefI+|nvJZ78r&($hAV<a!mQ6T7c>7hVk-
zCkTLk*U^4NMIbGFeTyj*Z7`)qipH-rN(4P@R)kH}18mtY>@Bq{mPI6Lo`bG0TFZEM
z-ja+OeEp0u7w!0=@t}lORQz`b+RwK1cbM!bXND9@JUz6}65{>EJ!J|tS(z8EG&t@(
z_(1mN;R&6!K@fG|{vv0Y+{=d`-fb4iSUJshEqUmZp*xRCr*6DH{5ILv;zRpDMX%=&
zo7F^fOx^8wTh9}1g*W9wyS?O8=$VBG1s>S9)`LU=21OMxI)q@b$I*3Sr#ymgk)@?b
z<QK*@=MM<hem@MpJB)G8z=fR}IDJ|pZ}xr0<+qxpGkCGFKWj;B`hs=YVxyTDsg5iQ
zp2PkI!%Vw3pV5A^v=w}Pid9Fo9JD<TA5r9p3tVEciYaq`2WR-n=XOG&lm$5`!RsvU
zAsDDEF5)@L{XU2PE&GFfEt{jdLO8~A3k7JGZCGT0<UUIOzFAXbLby1A-L{UfZNVy#
z6Xv(3KEF~gtF3denUJTQoPL+SAsVI*s4DnFv(UDY^68_(@NABi?1OE|&!|_kDfz3O
z_r59oZklwAYS+0WVR&*ZcJ2=Zi<GIkQ{*&CsANW0x5-u?7eh>y&do18L03m3y^2Td
zD#t^|`kQ<r+1m<dW0Hb}Teg)X&-8p@`vQe5&j~Zre}G4_GAcPLJvr*4kNJmpo&$1%
z50Sh@gYUyKOF}sL9N3#ap=s2tRj+8j#d2UeG>vAUTx;n=kLa_A`c*Po7M(G!PXz96
z34A)M@P`9lTkSeMVfz-Fx2g7(MFf}fSdN`JT=ml7);Gd(H+Z|>g}DZW<k$*aIda-Q
zcsYO{{hRMr^KlEWwsC@+l;M%n@5f9jGCQ6Paje`V<ERe22(*YdEHu_GXTwx~ucWP)
zDRgBSj>NQomKVu028f|u=QgPrEtQ<5mI&ip!r8)r-?_Us69G>7wwrvIyR{3dj^|;0
z4PWIba;H^Q8?a~14R-Sc@!T!#US!+Qgb<CU{QfB}{qGt-L<`xD|8%c~RZU;#z79^N
zhuk?wcI$cYu#B#}564oXV?mC8`%2{c>$@vn;Fa*7m9oJ`O_l*VZ;`GW{w8j=7DG;V
zHCgHK&L{M?0(wIZ%4uz5JDtlfOzzL?DovCGPXm4*S&I+4@K>q9%W(J7e3|-IZb~Z4
ze0q2#RUB4+3+=ZSEB)@VAKF|<buadh-Teg>+nYD}<hf|0NKXn6SjX#4J6={=#L8J6
zzruuEw$KrJ**t7W{O$@vO#SDOH#&T$uR`us-ShZnd~f6y_@NW&GG7#|7tMb#c7bi_
zlti~b5qupx)ruMOzL3M&g>6C_^Y4G)-i)E{w(sG-(m*v01=1IUmU0SV>&)JiO(cGw
znSb3RCE(8Ip(-AEnV=<IqVNe3%-Qu4m;B%UfyU9<eXGUmtGPj-z@LmSo+D&abbnL#
z?F)ldKI@bD&%FHk`|aBx4$9%E%~<xblYRS&m2se_Ozec_mKBk>YfqbAB*QB9m!pK^
zRO@d(Fk9W2;qMWRBp{~hYQ0h9aiF|Sael-`P-%u#XB+D85;j&0Wgz61$ouosd*KDs
zN4XnDlB*GJyV?H$`}H!ik56RC;?I+n-BgHOV|xK@SNA;w%hZ6WmbRVzfnUdi{|4H=
zJw0~UDAANvgiHm|?Hm!p&h29nXb8SO;B9uP;L+Ks^4MO12vvUH@gUPkz#3VW70Hui
z#&6TBUi;M*IsU8;@-g$fs0o0@S_iKjHf+`Mq4jNQU|V#cU3KzzWlB+IT4{*TsXNWC
zgQN?8+6x<F>ZnC=d!lc{y}NMhyZKzXq};Q*V-M|ow?a;ej7OQlV~`{iMqx-Em7_3p
zL;C#$JpP#EqXbW0$2>!dF_qXl)*C4zs?}!=`5+2VSLAe|#qkO3OKlm*PHz?N$5nIT
z2*x2d$nzlFNEJ)u6bKjeD7<e$LDVV_eh&M#j<H>sjjv3rsAZABgkxzi{~6wp)Pp7C
zZGP3}<G1*eWrmC=V7WtvcO^1x8O<zaR_M`f;m=B)q_85GMk~(p(PD%HDuNq4AdhfF
zuza?8n}-g^^W!~S2?QZ!fxXBDKl=9xd7(J+I0dr-9~+ZMbvWHNMl84S)^(5H7c0DG
ztv<6d$1G(az@&=t@|iLcG45Kg9Y+pgUJH%Zr_I`|Btn^TtpW+kCd##*N6bbK(5`l>
zsUoi>&8|)Gb{ge7-uw%7DbjtHkqg)xzXz<iWD~#Nj_&;jkZ$A6@8T?ZwtWHTGrB!)
zb#CS1eA8am&+N@Q<ONgw{%k5<^>0i*460)EyDCz^CRI(K?7@!nve7q1p|-bUsvr4^
zhfU6d_tgIZN|7Zo@8jjrJEb6D*FEi!exb-P?e*8olP}`g{27Bh`2Q?k>AVb4`YHpy
z9$JW*v+$>;|NCG9Qy%iXsHkeGUz%egb?uJ{CC?mKoTkF1@n%ChZs1LQ?ewP%YXgc4
z*sVrf&o|V{gi1JD4n$ywf*kiIVebwtjC5Pq@(J7S6!d<<ID&WQ@#^^rkn9)zw*;c?
zV@oIy<HVO{E1f-ESPG2%p>$OT5%w8=a#8ZM#yi$kon>-9t5UVtctB~^_~wx$>b3YF
zH+^N#Ws<(_wdZvBDC(Qo!hk;0AfUJMep$M0>7%eL!?Zb-*m=-~cFp2ZJ)6_yltp7R
ziyckwulb@4)Q3R&2#<HZq;_6T`$>6r%lNS4Yx#C@`VXMBH>hp*^aHu19#00#FOuR4
z){|O1)Ac51x*%#KRDPHfLfOMkE0}MhdJt@EDw-+nLDBK_ZwdYAJJJCyo7Vh5ZlAfQ
zx8yMwr-@C5UO{KKuRtAYN#M#uGZI56*cr<+c+5EI=&ICCcgY+(j2oSk2j7oYoVWa_
zkwK$qHXe8iIN7xsjP~p-Rl`0Mr634O=^TWPtOGdn#T=N~<x_o2{#!E@NDA9{y<z9x
z{9n$U<28@1c5^c)E|2a5ZoxnPM&@29nXZ-PiKL4v>sN~88%%!>Rdp=>$mkCpk%zb~
zz7{HTUO&A!^rI|~=Ye4J`(?`D2m8c@?tP7T*%W;;Uyl`U%EW!x2gUSfa*cTkWX5y{
z!&F{H6f!0yIeHsyzSAcOx@#G$^%9aN_U<>&Q&k1WyLR}YZnx+O3^Kl%+|z0`apvsm
z@fqrNDQQxDzFSQ!F;pgUU;FF?!;>#d>{Y;>`cFGcgSwWI%Fp~0uYQv-ri>@rdK3Ub
z=K}YFcGzeHTg|wZmEPt2Eujl*rkz{~)3d(mUv|p``t#>^K3RaYf18}uIXEB{%YVF$
z0T;4=nG@Gi)qxW{?z_1ARaO22sE3MsLRD7_<TgLEhpJ9#zh&8rm5z7!*>N1vFs3`N
zZ@2#kK%gWgx9sJjk7i=3F~Its?2{WhnHNLe;w{QnA?AKLOPc)eS+?xXj1-+#sI+W5
z{+wihQr1jD(wMa$EHq?cft!bnItrOT>eDg8s~Sa@PdClQp#5Be8B~b(Ize&F$*H?e
z^riPJq`q%^|KRLFR#dKLFi`x_8E#2f7$TfWPdOT+6d`LwM)A*)86|%r=dno*=!W2l
zK&YVx7R|~Wq{Cr1ZkDy=7nX2@?l|Fp<i!w@rqsDva<W{%G|ZitODd&u?vCzpppcC!
zd%r-cAPr-`SD4uJ(#zWl2GP{i?z%i(D2#m0U3Bk#(q<l7;W9oa@NOiA&I$YSoXQ2O
zB~Qb*;8hlkrwKC^g6NLRB1T9X9lz`+ZDxJ$wYJ|49~j}a9Nd~<xvkwf66+CvH|%Rp
z6}3KFJR|Lh*pq(%wqFaO|J4VGQipu1Nfv=|#eZAbxv*PW^+!zrr+M+ysyan~+pvB!
z<IvtQqh}Dzg8u!_LKT6XH8lcdn`{;ZleG3Sc_;4o(S**|6*R9;&g<#IMBh6|0R_8F
zD6~K2`{}<aaN*SC`v-V%#J7N&eBSClB2I{VDB%0E3Y{`x^egn5rS5g~I+=~iQCl+q
z_^5aJljiuGQ@(A<Tt+*Zw`}nWiEcL|-ncE&^)<Mo%sRFT-j*6N@w>7+KMPRE_Gjy3
z;&k1mpv}l}-CXOBu_=YpRqcvYwV$FVb#D~VSO?Z(4qlr?H<l*1bKa$mqgrjEh^Wfb
zUx4M=sX&p8zMYhxI*l61>5t5dmi!zT(J^*fnYfz^OrZ>TFP6ms&1){v!FlKlXJz~5
z^u71|I3_sxY3<bQ^7lgT%N-4_Ch16(AniO-Pqx?sej{hBu_pheotOpQU7_c5W}`!P
zgqG>~t3{_~jsbs>XMCA&0oOJwY@jdqICBuQ@m~KUH|fQT9x25mYt?#x=^hDapm=SL
z3>(}S^t3zQfwMc_o-<t8&5H8@4$?T}Q;QL^0D#xJ@TBb;8x%2D*T?NSdAxvz?FPoh
zy(Lg&$i(uDfxbhiE|<S8@EL+lVMs_#m6Jm4Ry_Q@x`UxdMWta~@xG<cuUb~y<nX$t
zLvKhCo`t&go;0>L(V}3u(Sli;a~*eJw%#T24dp?vyIRZNSuUr%Tye|nz((OUnSh(S
zHqy<mO0}=vOtmo^;u45S@2WLjukg@=JR{hJZP#SY+i%WsRpr8k@b)!uvmr5y%ETcN
z4A-oiDWiw!rv!u&!Q9v|j(B22ALtJPrg9i5g<iLsuSZ62ROb@pJCs$#D@HKsi>PqT
zxu(&210x9+3@5Tb1H*xt1tzBjI{=@yIo+hR!|}m&&AD#%Jy6rYs|Nm!<>~d@reR|E
zPLWf!<#r}W;+d40LXO{4kMEQ%+TCW%{&m)9I~SQB+)O;Y_0ZRpS9mz6D)CorvT4MZ
z&fBJ#;SlyapNxOMK0DPu8y{@c)J^F~??3bU+ttVr%Tf7isP&=tGtkJFp%7^jwo!ys
z2)A6Gwr?tYG`#e!@4H|ce@yQqhl4vG^{n-FPlR>wJEzL|e(PPL_gb7?IKMKCzndh@
zg%8gu-uVb&A?m!M^5fRKH4c@-^#{jZS)(KAXmnDj*iFDm0=9pU4KbjE-GLG2&laRY
z=e*`&E`0wkK|8$nCBgM|Q@T6%JTCfum|NKnCY+90FL)olabX|)*=$3@kujE|Y|9D0
zSHV};--$k@5S+qIezksV5VT?KxVUMP35&h_P^b7=?RQ+W7A-vU<ktejuN1}KI$@1p
z-`fg)<N$9=bfN2mS*3KftD2{4-Yww9-pos+6P5d1di&ukeDw8!va}Vrmz$SeLv9*>
zx`W!g%h_K;1T>W*<Nth%RywfVxT#o>wj!1MhO%0}hj%ufU+KbjHLo(GHWZu<(~jUW
z8S4{mZn!uDS@iQWuZ5`um;BTWz3zBME-LdER6Gupb{PreX}&P`@X<<hD~XO~j&d-m
zRv?iNg>rG!E{~5;rQVP9ekZ!8fx6L?j&0L_ULp2oISD;h!k%8z@laH|tkLD!Lt$|_
zF$<;_9W=)0LYtW<2acDo(xbftWoM0DRGz56o@CbJZ9e#Xbm@;!<gIu2G~iX}x8KTc
z`CO{E_B(Ljcp$887Gck@IxXuR4k)Bxy2$9gytJziNttJ(K8L-N`Jt+_1JtUI2wxG+
zV!^Md{e8E?Y4?6YHqbK-yXm>J{qubK1pI=Xw3KO`CL_S9$8Zu`7MbMST4=df;UOxh
z8}$dt)jr`T@<q>GK5(%{_3By_5-o2C_l;_)v+9iEJ(e1>sn`-To^WK44A{I(+;TfR
zw9nM#L?<!=tbpvIQ?W_GSNq!c%zP_q`93z7&t{nZeDJqR*?4u{iy^ZIojHPY$t$~6
zlI3I#(D3H|qf|EDEW7<V?6=kU^@K{p_Mc5cdGA7cTj8@;V&9mxTK)kRI0>s(|J_T}
zcu{t8X}`7j9lYe8DPH4Fb8W|-u`D!~uB3YFL}Uu;Bayy#TFUBC?83w^To<SUHBCDw
zF|twq?x^)D1Uk&u0i964TxkMFyk)g|jjxf6rZZd4d;E|=eUwEjR@A}0b5foyYbv&g
z)AE4vhnekWGQ|<Wl;@)T=R>^<hv3NQmFvE<Wxe}LjL^%$`-M}An}4Z0LVkXvuhop7
zUg)yRD^$}NiodWwcsXnnZK<Q_o_7xyNVvy~j2nmWo5$Qkz8SDcADJi#*`12TIi~0n
z;@*KIFBsve<2zcsmkyTbtsa945&jGA36Ijjcc=JQw{7*e0tC4Lp;vptq1z2r-v#tW
zh&V4if?TtKrf4WXI5(d3`?T@KGW$YzcI)GAZ>fh~s`nLxXBa~$-fcijm0J_XLj~;D
zPE18hIKHZfY%I6k+Hl{IjnPXHZ9Ww^a}K}S>F0+0fQX_?gQ9g_TEBvu?}`CR)=Hn3
zu;>+Bi&W(2pUItWZ-obf4vx{$*J_C9Y<%eH@pi+Vxi2z!q}nP*hJANr8LjZ2ordJ3
zrFwzaT8ypljV{tiFt9fNVm(IQs_5N@<*~K;TXs#cAkDW&zapa?BvOF9AM6+yHxo?x
zJ2iv%GjUpNAYnI=RW*24MFfKtrhjbdK)9<Apk(}M|1V88{Gu5`524n2qu`QX+96x>
zs42@s{AJi$WKQ@<@1w!`j~$~qG8?}#xh`YVUAzp-XYogxQj_sDGzsN#n^8Y?(qYsZ
zsqc+89t!TQ%9}LP)Q|#@w)tJ*pJnzJPwKf%e~3{%XrV~_#nUDdrtoR<W9AAQ$BjN0
zH`B|I>$2U3bh11a`t8Ud^7KD1g$v?lM^Gn%fjRz%VoX()wfhHFW4i~I+7JG=*|p6S
zn|;lfn^Xio+~__ZdEMFY^0C!}vmbwmx$$#Y@ih&8hLI9H4HZjW>B!akD?RxF@{A{j
zD-m(g#61&_J!$7{doS7bv1$*@%)IRK+*bU(@?!<!kDqn?%q-3uKmFwfAl6;Gdd`vb
z!Su@tGxeN~YjY8KZ|40zD@%O4dD}$X(TS{K)Bg;4hv1-+Xf(3|?+CP;@5Hg>!rj>K
zh~QVywdAijhV1gZ+LO$U12pIHP~F{jGmM<fBVchL(!a94b{%V9PL}KtGKHMov^BhT
zXKOQoy>%0koD_00rB-kdD@P*qR<>a(brBCHE0?Vce?-uZQ+zaJLMOtE95}KOH!ZO*
z<Z<un!#g!wdf~;8avJFDPCS!Luc327Y9hfh74=H3HusSD%ruv}yhEApDG!BLZF=Uu
z(Vb9kd~?y&8r+=75NNTAVds&AV`9Sb)<;$ZMzVBcs9L(A6#-@{Qi3T8!CTu_VpywR
zTwkJrE5Y_D$u<Pbe4ZcDcGVggLu9DaKgQQ{(9oOgu3m1l9{&LE+>6dR<VP@TkreTl
zFl9rlUWpHtoMOu>CEm4yJD9nS2zoDxz@-Fi?G=mJ)V;VXLT2cFBPh@yAf)q?&Wc*2
za0txAq(=8)){Rh_-&HmL0M2cpTqaBHl+(XOd#z_1)-~=uFAHMwTQyE#ugWrsdDPBe
zP@U7#HRWIICiGtSpu6jj<=V%t2w@Mwx`ZI@l<p61A6V^#6=pn8lcmvUc1XtT;>3uR
zi{P&9?1!6;Q+yA&wq+Yotfltif)=^C_Eei4rYUW*Rw90=y_)^_KQcJX+?2t#Z(Du_
z+3g#1T8btQMo^WX1Um{~vqPE|Yn%)q&_4j#G{tU~wC@8=Gj^?2Vp94S^S22-DR*)Y
z%6bn<qk?V3x~t6K_HUeY{oeVY<AwM$te7$l2tL}mKl4~ek1qBqoWu>duL7brNS8#O
z(P7;lrE61pq~M{K>CN_|*^%D{*`E^pQ%w-!TPR*X)4_TwQ;#eekKbr{J?*#+ty<Z+
zs0eISne?7dn%k|WrLAaUg=Qq<>C_|p;eO#lLhG@?8?t1-{>;~!8`m=-jc`Uq_2O?8
zP7Kip#SU7-U&hPu)dtx-dp0z2W(*%J!(=~<i<Y8-$GtVZ(MQ>4mo5dz8p}OOq1GGB
zs=sSmn00D!&gxH@3DUw7cqdL%;W$3eC#tIFUy<*9EFPNP>g(U+$n+n%0x4QgJSuB9
zV4S@(wIofedEF&HCFr^x<D5kQ0qkEn)|_jZ?7(R~5EfrQ_o}W=DG@azf9MD{YU@rt
z9rFE;)-AL5+RtLjOZf=75r9h>wp%uNtyB09aQ}5GfKxMB?Q*+Cjgpb9gI^;~?{C|~
zhpVi&KG*WM5DXqVzL^W)&>BJ3YQ88$p2aAfZCrR&ouJ}7H#h$#>U=S$WpB{P6ny<|
z7xR3jwZR1crZJmVcXZ0iZU3tNWgDD1Fz0ZSBBMs<@9VjaHqjma8Ug2*--6NcA5RFL
zX6VNv&x}>7ce%o*#bVwy8W}#adc-qZ_l7ZuEsR>=PoNFd*YoMYuxTH^Qij4Qrw0_e
z$M>f%h+2J<(eC-A0Y!&ssTA@^;-{DPYxN=e5w5YR*)y=($0|AIJ8@)JG^b;AN-_`3
zYE*esw3BwOkdH1LzSYtgv@(YAk6zW?=<s@(&6)HGA*#mQM|OQ9*|%>E^nDHhRNmf}
zDR~pjD@CK}t@;n3-8QFh{^3i628|W*3<v$Ji*E=jl>O}I=zz|>xUM&3cX>h@=1hlm
zCAoI@ZOqBj?f$#ZNq1j$Aj4|gNIWqj%5)=eb((nP6G(cx_>A<S{p|aQ;LdR^8Wkb=
zO1CcTIA(3*^z3?j8ZY9S86&*~@8{37n#j!Oy7*c*Cu?ve^aW-~tFpE>L%tX$Th5PI
z#%A?h)%C1geHhznN5`7AZA&XWxIm62u(<8oV?iPgPayVEX{#M-`*x@XJ!j+JU2LzR
zBRxF6=ae0XSuMf?TTKiewLdBT0p{&I$1Uo^>>UTlw(pR=*xg-3VU5xxJrpN`HQ!(X
zto5ycaTYP=i{0JKMvqm?ET=K8F1zPX6#25Ongqjt|8;D<o5V4EJa$9aMcqiok7=6f
zcYKyk7M~rrYj5|eY`~ao5yQ`f{~Rg!E?j>+|2w;lM-;(6fd<;Y_U}A0Z#U6i%OsWI
zKphbY(=vZb=L>lg<EH<P$3<xWp?Qs5>6p3k;U9n~el4k7uxbbfj@9Pb=$70S)T@xY
z<}W&O!aYelx~%c7vaVqRAp(iu#+6xUh|*PkeR|Ue7gIiI#pc4*+`W-{!QfwH_Eqhj
z&ul06g=w}wrW1Z8C|r`NWfHrDQy4Vd6}Ik5Y`Yz<#3Yd<t%_8(lP3gHELk>nhVxT{
zEf=lBxun7P!J48Ee;(N=zq$EGTEgydja-KM(}$`DrIwNTLOgE^vY;m^o`Ii~(0@50
zDOJ&yWmcrMF7AXaahDD0;ERQVgZ+_777v&IvL8~ibBm_iO@H#-(6Mo5iv8&nG<{sU
z)F>*^p|&UWS$~xzBo}tmX7RDzKLAg~&*Y8($DEqO(In4IgVJ~Dh`~^*HAZHxz|wps
zk{XS$Crtti<@~CWMA|FSj`?p-TOYFHFU%QZ_!dE6#eRvg>}=A3s}U0H_Yl;Tuyzp}
zQdi=!ry<Z<c=LH!H7}kx$%n0q+xUT<i%OW8LeKRCLcY5DCsEAo2oW7OZ7(<<<>P$k
z?KS6Xscd7IJfz<Ensbj0-9>xg;PIA0$US;?4@e%5-Ip#b7f$WK8LPp5r_Fex8_S1M
z7`ivGW$M8nH78<w*hLUQ2ijuk(ry%HTLRPVi}h31OQ<1uvi@2-ysOzDs!?jlX`1uC
zfAOmb4l$b-jrzTa6BVeu07z5=0Nzrj^W_Ow-VzJss8}zB>N7!@5zR~<c35Qbv~wc}
z=PK+D1;Z4s{p^P>FT;UnVyH9$>4#sCRb4jD=^m!tIR*qn=eFj0!;0OHb)AthhcX)6
zjuA**L@_}v7$XZaO1rczS}$f@ulYD+AfDBPZx6=2v9A(&PHV_1RE@6%R153kgkW<#
z01IM%oni!zIz_^&8%=U0#RWpixz5;}n<la?7#zltY3)bOk*-}RJB9ZRy~om(Mm-gR
z26r)GWkER4G>U^#8Klh<unF{G(14JbZtAXOXO?2=ycRyo=oy?TBcEw*jlHtlGZe10
z32Hr@(#|iFN(~qt)l#d!v&eXJhfCn)KL9|6{q9mZRRKtxrmpMbCCeTnm=|j*-uT9}
zi!p6APj&P%@(-w%%b=NkSK}&s%W6&IukH5(DxX2)qoFCK7Jsd02WRDbu!d5*sXxVB
za^|<MCBX|HW!ZX5ZSzjU+AHx&SKV{fjorCdgFe+Wx8A*^=XlvzU{hL7P3hiJY9mSC
zlX^}~_74z;L2=(xoN1|e@Rm3JSNzqGOodb7UDGyWuTU8U;pZmDAl2cp9l-)6IGd*G
ztzuS>Z9B@NJ#~`Jo+AJFoC3{vnXB>WJ}nS-z)@u(Ei1s?U*~)7A6i^M%brCC+xQ3F
zo02)|-Ya3?gQ=RWEI+N5kuQHaDN<#6udO9>Ny=yw8;cXqoyXHtw8~7a&XBivr(EB!
zKjcsS#DASNPYr>Tg4wD5^Lo7XIkvT`(~bAi)`f*#*v{kl|CXNJ={#_|(|*#iSmQDB
z4^a1v&Nh4T@L>0JNchh(x%S~9?WuO_`GiSZ%)SEO&ox_g{G!}MS-uHnM)Alm(k#T#
z=4fnX<>%Nn?Ozh_3V5Qa?QB_wUz|H}XyscGYimE`opFaL$p#2s_!xirT%j`)6Y3nw
z0{*76RpGC9r87N;RoecsqWGf@GW{xcpx#-hXhlGtX8#{xu{Q73Hy68n!bivV_JLAs
ziMfAtaC-F#Kcj3t^Sv%l$s0V4IQz8>T%!Ua;g9vx91ljVfgbzLcg?vzf9je_o3t16
z-Wi?={62l6tdG##eVFBa9wB;k-9qe%+!86%1X9KEr|RlhE!3JY+-kO_^P+Wpq9*va
zZs1-<(Yfd|dKD{KTD6gm>%XMC=l=i<LG!+7{?BRC_;L9R*FOfvx@fV{v7d5B{{UXM
z@8=NNqJz&1jC*9&QdYYrtG$txIXmXS+}>KmJ@Rw>%~evPQDg%tyI}X@{Bu{AA1s7?
z+qW_NIQ==QyH+-725^~d`kYZ9iK`5lwa7hNulfG~dZ@`eJhI4sRNOY7Z%@*ti*sqg
zM{~jS8%Amsk1R9Fr_2Yh`OQ_^Ng3MAo-zw#I3tzF>Hc$8%a1UCJK+BSpUSJeeqQFs
zIl%P>pN3fa4u2u*`qd#aPm&Ze0fHABvCwrqdXrFxSjxWF+(r~&<a+%*%|`N*At#kl
z!0t(>Mv*pVnL$<i#Pi3Xt5PAN$|08mINayvJ5&ID4o@W2xNPLNc591*$D+i)Fi&=4
z$u(TwNf!s@kmKe4=6!$K_U-kirOceFb?SH=dy;y7HKLK5MX4?A5)a;P+IeK$06*^@
zrl{OrX{d`Il@LI3sn9X?+B*JJ+lzxE6jEhua!<;D{d%9n6;Qm<lG0wP1JuTwPtXC+
z=4twZK{f5fl5DiHkryB9HW7jN;<IN<`D9`tcOySEj(~ny{&i+~qJ>FzX;}0Q!GAnf
zWb>`y^2_;4*uXb&@3>KN(8-O`#Ko3&UhJ{_x%31Ml`V|YHL-1gD)E&Z{VA~9TBCs!
z(+L-*@09-l<F%<Al1cK+Qbim9<0tSVniYy{)$r_Yu{%CKW6)KNLkfR{t94Q;eKItT
zG3DeQ>c{%jT9mvp?Ew4stB%JlS9fy^%3^;q7UX{jt$Q6&ZsRe^yI6N4IR60aS3h+t
zbPh*7xT=C_A(duEB0zJRSGLA?I{T|w?iHRRwsHwyn}^45f1&2N$=FC9Foqf9Zc(})
zr=?tjQCXlRl)C-rjZO&1b@v^rJxKkcE$3^>sg;#S`qcG5?FznYpxCJd<gUgeJqCIo
zV@l)#Km>Y&+Nk-NInO=Mtxx9psgL<+2OsSZPpwR3QX3<4Rhtad*PE_sBYd?}MKoHE
zfm0!>>e#74s35eamvA*y%d}#b%)-?Sw6EHoyCSK1+|-LEYdJd**)tl6n+G(k!x^g5
zNHA%}Wkj4wNMx#Beml|Ud)A6Zwk??ltvMvB23l2==9`#U)DnzTGRO$3hFnx;Vrw?S
z!b)<~%+ne2)P<^BGm-{8)77}B*nHFjgI;<(;$YP%HC!5yHxz=v?bfSEYG&rEN^wF!
zLg$)jYHPPO9{y?Aq>G4hMOX)lr4w;h194Rt*wQPgNNNL8zG?<Fp>60Ynvi?b5LBRX
zP|U?qPXdq%QPzMg$5W1#UOJdJy=FsT!lNRjV^I@KR}@r$)FG+rYCx28NWy?=tkp<p
zV^RY_G?dyCCsUf3bgHO%s{rvtMun%U<n*Q~!L2<#!$_*o2L#seSJ2TRwpKICFYeU+
z4<fsLGS#AmM5q<Ls!cM|B}F0)m8*cG01EAeK2&WKK_DP;ip{!W0mlc{v}A5iTBA5q
zoYYL{iOk<DnIHi9{t$VrY3$xrW?5P(2V9IF$JV-g+f|SftIw@rTw63paee{*F5if(
zS~?T#Wk&_#5w>{4ciM8UKAoy%x!4*r7zKKxkGg$EJ9~<_VU5>!1-lRie@?W`j7Ud`
z9CY2*oyU8PhIt)E03$pr4CClWU*}fQno+lQ)*Xqe%VHUu%Kh7SQb_#A^%Q1U=UC@a
zpW$^pgYW7+>4>%t8+$62Uf|WB0bQYp$jGX4A#`aMg1nRJr@!Y`ZCJgyW?!BxsUF|f
zkOHf#ql|`Ajmmi+l}@OWFb{=&*xon?>N|Z!F;x}UkKH~-b5pbhW&wFt8B^;(3^t7z
zbZqho!5s}tY~%$DzcP#!>$qo;{uMkdSxi#oE^w!zT1K~TGbDWKHjqA?F!k+8uBa2}
z#`#-f+O4=_gPx<Ppb(1~$-zBGPX7Sbr%kfLxEz(>spxs*^vL{aBPh`Dy}`iknoSKd
z2T-udfOC_^Ju0lL%Q!qPPCrlatv@Uk+n&7R^Zu2cJYXv-G5LWR_dn!Tb4A6Ow=;Qd
zh2v=J_!Cw%*ob)){lNMB2pInWKGjVS&od(rodFp3^!zJI1p?HL?o?!Y0LG@8ySa0A
zRxj?%2PcP@anGj|>wpSKI5-*hI6Xh{+M)K4NFd}nLG}LtIjsl(097;OoUqTf1#F$n
z8nMbq%{SXY8B)EEU-QK}Fv3OwiDg3n0FVGwHk;U7C<GQzgCpLoS&=o(%$fO45j`+D
zKl%-G*y(P>T3jzB^FGpkY>f8f{{XLAy1;2!RY^GA{(m7+=&@|HS=*ku&#5>g^35rf
zo5<D3BxfT%PfzEX(JMpL&Q9~cVUm1`$PNmRPi%jmtyPXds8sc1pRHJti$!o$?qy{I
z-{r}y%U$xwq!7wDB=>CK56u3RocdU}^)?qF5HFz^9{3pjD$Gy?W&j<i2TtVn{{TLw
zqlPWquR)G|g<DoTaJ*-bf30aBGdU5-7%#ggXgL9UpIV0C26x&HV;J=UuCFAM%7Os*
zIq6YKnUiB-x%q${%~3Zg8-0)}L<16V0q^hp>Qc#Sk)dtU;5g%f=*RjG#;<u%!yla{
z)-Cr`_xJjC_o`19S!CSBn}N{C=c&(NI%cgaYt)O}s9TRVP2xQ8RAuBoi~clypp*r`
z$MD<6c>GUu??lYfJFz2Z0020`gVWemSzWMIMgbd`^uQFZI}eiOix~0&?iZ)$RBo=t
zqkN<^%rTzGJx};k<u4RIX5X0o)$RD7=lND-w(eyjSNV_3?0Cnp^ri_AJ;)>(0dMZ1
z>V3^IB)YiDENdEeW;oB#cBlD)7#O44y_tqtsf^PfnbT<OPu@RYN}|!FWFm-wUvqa*
zD`{d^RgeyzhPhZK3fbF$d;0!0zo*=@kfnByN}Je;CUmw(<jT`--6YCP4m&VBkMSPW
zJ%VL&PBZC7p=zE}H+*qg!8;t)PG0)h#MulAk(DE%9@UuUugd;gyi(<%CnOF^pYIQU
z=UKNP#U|W>2+FYN!?6`D^)R^9nc<K+9zp!-+&6!{C=bd`<n}d~&q?NUQ_NM%QjM}4
za1SD)X>dByd7GD>$E{bH>8WZsi6s93Hc2@(Bob8?Vogm85!S8F<+f@oRH6P}wMkzh
zpTCMpkjK2)f29U)D!J>LYqkYiVA*6#?^DGtd)8QXCapk7BC@E=*`Upps&X;rtY<4#
z<Vwj%xH6QE!T6^ulir;qd8k())$_BV6M{`STWO?0>qGU;9OWV9vr#isOJbN?H2Hy!
z-OVXD6rJf9@rqY5k!6KP-kwb}_^Xa50z<&!f_b1(ift6ojCiV&w>4%?l}ba_nnYNJ
zLsi3Nsq(c(qZO)&q!r6Z_!T-nYLc}|uZpfyXB`VgsxmpI2hBWlQqhwfwK~)TgHtKu
zrHaI&lnk0`oYaV<)RM^*U{mu*1_dWMtjGpxVd9<Mn#QRh8J=oYH4l1lp`fu~9x07T
z4-_+Yp&_I$DBD8xr>M<S8$v4J)S&TABZ@$9YC#jl_rW7I*Jv?9#2=Bph~~K%n{xr4
zYpBwPlE7qc@9phflo_O6r)Z3*fD8(2N<C_<avh)JQM`t_B{QlqC6Y{<s_oX1rZ~k`
zl5bKg7b8@N=Vlql^Q_6F`HtmW)tifez4KK8^G4=is(No9<eG-fVpfrts8k2GFf&Z^
z3miNH*@q;3eX7|=iP^9g>`wsx9cfYvwK-6MJ2Y4YPxGcSDC3f0%u)s&(R28ZYMJcC
z#APN$+tfbw1%3MTq-$wJq1j1R<g1_LPkOTrq<hPBkv{e~C)|$z09v9*<Fc9}k|QZ>
zVZmkjz53I{E}>>vTmi$V=V|;pkN&k-R0;B98R!oq>FrVZIk@v9A!Gv*MtI2e6b7SA
zOsqs)dvw9;`qc4&>SJZY2Fj7f4ND&7KvjuV{uksCM2iX9Lb8H!xej*c)Ou3TE;D8(
zRmmH_fP;+vdsNX#>&U_9Di2Tp08!~omNL+lc@&U-`kn#pPs<ef^O5(!=O674>q_=x
z-3G*s_onTmk`GnE{$iKQ*(vA)3><U^+LYu4e1<B1W9q##O)a<(58cjwosZ%Csrd+G
zj8P7A(;(Fww%X-^8!^s5n5%`38ZrSOoO8x&I&5KhJ5LzT=~zvg!4XRjmwc0``6OJi
z2arJ={#ZYSCG3j~SRW@Ok)EF4(x;64jTdxcFnu~7>-mbz)pu-FAZ`dZeiS_-+AV6*
zWD9%dNUEo}Y<&(ZpqgE}Sz(Nk&*lFB)~-WMQ!LMr!Icl?&Obq2P1)O{x}1Z62ewcD
z0A7k)HY&+oBQhvdMDh+cw$u4%`I?3m@^ovt{ULpzae`Q{^{K7dTHVtiG>l0dh{s>^
z=}ftdu8k$XLo*`&Y(K3Q*5TxA>F7Sdha;-Drg-$MY1pmwu21@<BzENRKT=QOTd^oj
zJYh&L7y}2`Rt~bs3qqWSor}xQJ4Xt?i6`~+tD5&c2!p(skh4u?b>)q&Lu}8fP(P(E
zodJd=W&|uFEuN*Zz#l?IJ57;%#0qhT^A0+ZkMYG^*lg_NE%$~oo_XZ{ILSX;R3&wO
z#*NzOu^R*n3Xi%zZM`$ztSD2vc5+8<!=^t<p$Rd72Mlq$J-xqL(6-BD9FvfEqB`zO
zv?WMBbR+^7qK@o;z)=h-xZKD~MyKTf<9mOH^d9wUHCbGMFgeR-pzeRCy(G$BKoA8)
zUzE2X4_|VB!nRAPi+u|7G9B{F127rdIKciD&u{5f^4?QTC<V6gsxSxB*j138a*c@l
zbMlf$C%Hf8pQSc(trCC>5AQeeAE$riLj-0?6M12njnBzaK*{Z&YO4*C&gC}~yOv?;
z_5Qu9UG1Jp_V?=Lx{q)1sOQ+Bp^898aK!%r5znun^rVK_iF(S9h}Uxux=#v#cl<kd
z&1TAy5JY>IcUIey$NVd!dqwiemn05wJJxaYT%VUEjsgx90QEi3wF6AXOGW^&Cx3UC
z?mv*=RLg8v9%fzh(Nt&r1!(=5iw(*Uo`m)Kf!pe7t8EzJGTq5yNorwpIK!4X*n{1-
zAJVln>vRMkynUAyEOy`!Ku|uFqi(1$Lvun`6K0*D+Moei4Fb&;4^vt2KTKAPQX1M8
zFrKG3s_3VdMF;6zoXA#2w37n|DBYF4K8C$r&^%;U3*tKsk`s<P<k421oXF>rIVZ{z
z7!XGU9@NEPYUR{sL=tjXe6Pn}LsWJZAZEFe1+ge>(mHOW1DccqX^lv%(fd%VgT-0c
zNv5)5nk_O&`cj&SSoEipni~c<42q4#NDmYh$e~Dr6kcl53{w?&sZ>&sNfRN+rpeNp
z4h=i4W`!NdcGOP0TQy{WDhZDqRU@L~RM1XpBG?rhN>-hR7D&ex7;svnk;xRu$sEvK
z%OuhRkxVa24-^>4vRaco(+g0i*12MECQ7v^=}{$2I#sgBWEE0n;MLX~)GnE(g<~<I
zC9~S2BNd|2;8j^7^sP~>6^fwYs&zT2wh-LZs;9MEdKyXSJrt8x6>untI#k;+SvFwE
zg7Z>=PDts+aZzzxatgAGP(&)qQe?5)c&M0Ttyt|)$yt&_d8V3CN>??kQ9}SUy(!^w
zNW;AnLK8gF5k@E{6p|_^a+;ZVs1l?N?@mK#lygYf6oIJk(>|3cU5K}57elvmo`cig
zyDb(4C^H!czH6J%?H)^j>|76VUFEb$k%9oO`cbmEtRtx-C{@6!lH^sX8mY=#r%F<1
zR8@-3>N<l^^85}*etJ{gGs|#2eQGBl3=Wy-Dm_h68wCV}C6B9fkD#irZVaC@F=Ofh
z&013#enw3!fk+@m%lOh|BD_)T;2i$|vHt+pt4$OVVWf@t0}UTh^&RR6(OmF@IqZ3>
zs)a^;vIk?I{{USHgB*}VP%Li62<T2h{{RX>4$Nhlmlzr69^Xop4$uie!#zh@m4d$`
zBmJCpswAqrSLHcn9XRD_?F!4W*Co3UdHg+UK@el|oa3=LBAl@W$N++R<ls;QDhl#k
zqjqoe)})Yd7@;J01CPLpn8mzxInHtpb4?M1<P(o>qww^kG*tpbZZVF7>FJN@(xy+G
zfT~gZW2fQ%6h1d#g;VtYpW#grVRrz#eF5YD0IgAXa_p=;t474~q;u_1%B-YIAAA+`
z<FWke>XQ7BLym-Zs3cH|LCKA~4l!C=LNBSAcHl-Oi{%7hkMrr%@~TlvvdmPTUyywd
zU*}e?zD9UlfQ&eP-pBm=)@_(?JjBOuC3^B0bovlMtR<|GymX0sU}tOsRo9$$&-oRb
zs-7?j+{jm_Q(E(H3UPvR7&k%d{xzGbL|Ig+<$){p<DPx0q_idvQ#VX+^sRw#!aReI
zdfJvkEG0(gYK(nwYn-xU6M2MTnbd90Ff+#<mALW`vX^Gc?hJV923XY7*t)xvtw~)%
zNY|A?$K_MYGCk}u927WH$3j(!{-1~ySsaVhAH_6o#QJt0){%?HeQvos<jSrGr#${<
zn!1QRS-l?Rf&^yeNEz*c?f6z@@+6XdqG9(S-N5aS>P`pNu13JPQgAnQ_X8aMAJVC7
zVPb(3k{z}c9Y{bhzf#2oX#Nv8{6so?he>h6k@I_d56ITf+11uP-!M~_J(ub4$N9x`
z8hnJ@Fj;qDh-Oj4Hhy9~he7nNwlTI9L?CgT_Fmu5rDIF6rCk}8vhBD9u&(EY1BDf*
zYQ|>?9~mUE^d6?MH93|!#B2VroE2Vnf$RAGxE-rf%$Va_bRfuo)^or=-TXWI{U~1*
zg)fHYz+iU4ZUY{yTR8fk=qka881QhzsUTx_{{UK`g~V8i6MJCio~EaeJALv#fY(ay
zbFm-G35h;#+d}@Hn5wvUEXcv%yN-%IGu!Z}XSzi@R0cnw_Vp%|Nb(K`KU@LqDjb%@
za=phm3m(LjAnrovpRG1KR6&p!w`OCEew8{~p00r7pev9lk}?#MOJly@=qaLEPGYP!
zq79whn*-_wDH;em6C#XpgOYzrkhu+&j41x|a(@r-6p}0K+!S(oXW)G)+zJ;E+qOV*
zqqsjVe-WSNDx)lL@Gaag_lHCH)`XBdj1~ZQ!0S+$OdP?GM^-13`BHA<L?dY<F_1Cd
zrCh1V>FG@l3uZ+>R2clnPvukX!fsKO{_y_*JWzzmp_#wC_|}v%oD9}V=chGd7iIOT
zOqPwLlOnOT)k7XisruH03{Em?rmtr6!gkiGO`{n^;AbGpz>pO1YJ)~qh&@5YYTj79
z1eP^MRn8kdct2Xl)Xl3RxI$Y3tSopGM7cFN;8Zgt^&{8-sNWqbLe&c&nyDKmFrXaM
zbInLk%TN}<q3Slt17f5`$*2PZQ^d6*M7&dlCfZ(<TH|9}`cu)eDl^HUPdKP*#&1SQ
z-6{zeAc~tbgk)2TG*T$w=A(7ZT`}dU3^>hI%_7QzGl~p&t3f%<Bd-*g<jN9oY0BJD
zGMs}=2B{-CGSC?`{kRk;#XA{l<VPKWc&SyYJYtk)sg05WlSs=*q>6c9M-(7L4z*H2
zR^&MqRmU~FO`=Aq4M`jv=7`rOuEifTrpc2T99T6cms5!8+iM2Qn<3eOQI)BmiijMV
z7OY5maZ^S)s7D5-PAb_=4<{8AoYJ|)M9;-%ZAq});i>&<27YPi-K#`^oOGlkB7si>
zG?>U3VAN-mR%A?yghoeN5oi^0NuF^}>BTc{TPB7kJTb*AQkdopG5F9p9@T2sPd84x
z4h=XbeF8e2W{rFbC}j;@DoPnR7_1E&NBce+Ndq5B=t6~D0bTdd>4If>AFWqn4stlD
zrXA{|$U1ZQRt)GvWXL>Y=}`Ga+m!>{ig``^{KM-?p_jHt)~4m80u~&Azg*H4OksdN
zqMl!9<P+>EqF_!6F!awft~7!;;zPj#t*GPg4t-RLhCU7pZ9neSg;vH^R{C+xR7D{(
zw;&J+@6YQ(mdWP?bs+Ih$>c~6)|wRw7}|dkR7miw!*}XNX~75wE5YlGe=N|E?ci=+
z{2H7B4o(lgX{0NV1@5HeQZl*fFnvcuR!H5!VV^^h{V59ik_hY16gY=5QKMooFvHMx
zs1+U~xdVmzXCB6{H<(W+Bh*v`6=8?idnw21{{ZW#aV<z{vbQU=cR1{6Wnkdv?;qBe
z4UzXtXQBT9Jbn~Pn~<o^dhI{cHKJM)-lso&!Ltcv&+zB5J&&>ebwf=iS(zWH!32&8
z91Q(&kHW5M=^Mnv@Z&jePyV%M=oc}`B*<h1i3Pnw4*vj_GwY1kH`_XX#m<Gc$sPbX
zWd%?9{{TGJUZxJxH;@M603O^5*I=J13KW8J2W<EJ>nB()%7sS^FdH0>D`{+F4~XUx
zR!f%1A2V>?r2ha))s{jmnj^+T4^znh01xL`Vkc>%a!x~V3C;&>dsW%oODeLj%JHcD
zNF($h*E}z&(>q-gDk{vFW0wFD2VuebedAEw%v)5k;PEVnk9Q;dDp30+A#uWw$PvwD
zeW3pUY1xsS1Z;!b=LfHT->q!)iJH}$;zF0FbE_bH^am$D{bBjhs!Bhz(aSk)#~kD4
zIXV0RC+ZfcJ4)7zaJb&eFfoi0O*dMeQ5~!Z2}aAauVaosK2Ok6zT!&l5oxC0Nj9Qj
zZrTS-V0Nyy*>Mncz){ft6~*YPk-DG<EZA%g)!f={Sb=l{9ICP44%w)pRej1=Ht^Wn
zmk3Pn^A;VwG3nI%^Ts;VRwv6MJ-@zTh6~Yu+5Z3^%hs>zsuI`DXZ`6T7&tut057j!
zO3JZyR@$Uzxg2NM4$J)N=B&{WcTDMRV^xflBc7~#(`F<l#UQhP5$XQ`*QK{oP!?gH
z%tc5@X8ADZ`_gA0m1}JbZ>dQvLlEppQ^_FUnzbB201yxdrfF3qke2fcj@<E|U@B<g
zz{v;OoC?iFT(>9GV+U>q^&L8M{{Ysgj0)U3@5ew+YSRoKnUF7h9MbMpIZ)63e+smS
zxm<={yBI<0asGcQVFZk^+}*G`eJW>^x9|n`-ZNEM8-C+3!R^!eP{9i7fOC!B=Rc)W
zkT@SJaC(wOT~&1>c_e!e@u=h^gSWS(A(`xj+YcE0Y2q}5<xU&hnv-B8vuE+D(iFkM
zlkc2V+^bx$p(OjH1L@fPDn*QE?^b+!cV`FltI<ebgZI5pf2~q?C94;qJ4oiQ5xz6t
zv!m;d^`#yP;<Sr1k~wW>PiZp409Pq28#g9%-n}CJ-^`4ITy_1Qmnk7>NXl{0#8I}9
z+XAMNHd1#CRcQzVn!@^=$qg(3$*S)t&sv7!Vmclwy2x18h}joB&w67V^HG9vPgA>!
z3P@DRHF6`;t0xR9<aIJbXq<GY+*81&wK#~33QXat#+uzIk|siyr>`{%`qZjxGIk%f
zowp*Kqau`y3Rlz=tVYN&P%x`JnvK_*cQocR=3G^LRoT}fs%<`%ab;TQz6TVlywtmB
z#szYtiLx>&F-pYdl}CECE8I{ikKUxH<jyG!s?uSwX}wPrj=5@dXhGwpZ5C2w@g^$V
zOM_6wR*`BZ5@exxq##q&QZVAMGHj<WT8VfyETEc+M>V7oaVAyCser(wMZl+#RM~8=
zDdMWL8n-B_j;AyrNRPcrl#3o}RE&dG^b<t!jEb<S#X}NeuST2-$%!HlD>T=K9M$hE
z3S+MnBCKZ>=cOjfu<3(LBADz&u(9H{w8%!`nF5N;jB|n3wzRtmCVcUV(vs9^S)FBs
zXs9rA?^nne;;LEMn4<xXD%*UkU5{gUqMXMSRP4@s)v1np)_mC+EBV$e-4mDja&d~U
zBX8(&RZYVw%VhlxL~fc!z>FyMV^Jn)7?w`(+?M<*MP|lJ06ot&RzG>O>^Q5DgN?up
z`*3PeM6q%P&|DtjxT{FA4ns%n$3LxCiC7%5V1L>*bS_sUllXsHxRD|jGW6WP_thXI
z@Ji>~nwJVk2k{*$Bd`O4N%aScWRDpGzCRj~TqyaAkHDI&z$os_dm4?OV9bQ5?~_)9
z(V!Vv{pRPfrnpm(u6XyXnYD{sY=@I>J$U2ssx9K1GRMoAN$k>sqw}hAk5MQ~p}V{X
z$C!BPGyWo!qk)AP9-XTh{6++U9t(+EyIZi31N7aKQ^Bji$8n#_W1b{Nk3Ud<oh=wz
zOLF*-uMHTD&tuTjnVFY6dFg?Qt@fCuLApd`BO!xl_4KOp%3{FW%!BxW`=k6Q*f%qE
zl_2g7hhO2(U#Iw0T`x|GYp=D%Bc;NqZ>b-`N1++P9{&JJxv1@)0t1F8rs7BPs`@;E
z5<<a*sQANo2a)yX^{lM9?rx;{4eg*ZXOZ0JkSjM)h%_EsbAykl_N(gZ(DfMy8SH&3
zjrki{vy!6#AM@M#)ih-nsmMO`c!>mNL(gN*Kb2`d-!|NUaIq7C{{Ve+`Tn%qm=)e2
zqiFlX^CbTODv_af0RI5fE>1Iy4nL)51#6p1SFsFi8tLDHk%9_?{5d%OMyTDu!u|jO
z$C&DI=uX^zXEmiAs;rlClZAIx>Cg<E{$$f`;SaUg<7;o@u_WjDkIJ=b=#?umw=xHY
z$^dvPxRNu|j-Rb$Y8M}Bl1UV>F2xWa;Z%G5D#oDh&WxWTlwQ5C2|v!MM)8Rx2YMt+
z<)OeOh~zKRIXUT?=bo(E7Dm34=C0p4$vNGy@CTqjLDst4R%TGo_nQRcx1g?POoB93
z50o3XF&{C`I{yGKO6l#vW*%Nu7mdy7=s(7)lv$lwCdn@C8fMD24i8bBd*k}{tcW9q
z8MdZdpIq)I)YjeI)Nw${#|4ft)Sqk`#)jf{n6WGde(J73`mySNPfFTp8A43lg-AeI
zOS$zp1OEW6L6MG7I%hw1dB@>Y((7_<JKG(8V}bR>O1F?j56vWTuW|+l^ZvBfsEt!C
zqe+316nDwT{{UL2QyE}eV|)Jq`s)F;+ZeYoodW0Y$luHV0M%9_)*z7;i}1raJFsX;
z4cyxTGCn|`&YJvn4Ufkap1OoB(cxk24OvT=vGQ$Sp=JJ6S`(<OM^MSn(Ek8hZ=TEd
zV$Z%np>CDe<YaDo33L3#96zf48#S9L3Fimlml^at8jeB<IYtM+YLg7T3o!SjW(>Gs
zK<({FqdHa#_esxegT+#~XWO{T_+qrCaS_KSzCr3MF6QzR{0_u_bW}*RK^*6K<EMJm
zf?z<+VLZFOU`mfdDp;p0j2!)HV(d91q>aHN=I7tNX+YTmxyw1rumo1^yYMS`vlfeT
zAZ0a?djxw<R<;9iRj#916xB6yP9pv|n2<QEJBYEIb6tJrlo^Qvv!6+0#@fP3+`-dr
zoP@9vPDM5=a>s#PY0w@+_iDmu8wLXNO&Tg}<VCAD0ePpjvqU+rz-ZoD{oZO8(b`tY
zUMkAvIR%Z2;|8xH6&%+3=x26z@ln2(#rPZwhjBQW2RNpXde*yX#CE7&#fji?LUtpO
zvoO<0z{#u9T9q6vKwx9kP~F7h%A+LlQ^um4w(XvkVmnqns#_9+BTSRUP32@}t;1)5
zk}6XHCjeBB*t0jwz^K|1dsfof5_(kWrvubfyO$aa=9<x%bg2Hs-`zFW8x;eALs%$Y
zm1^edk;=CjsMyn()GXZADDn3KX+h$nCz=5@Rw$G;HTe{Rk*Ee&ag$R<pyHdiH8ev9
z6w|Pl<j9z-C^1b%sW#CgOI&yz=^LdvM-+ugtgJ(cT1Cx9iOpLnii>S(i;5hyH8*xC
z3sPk<jyqIrl_5OR<22$Zj6~+A5rImhigxT%lPQa|VyA_LRu4+AwC1xBp~fmUrmUG1
z7N!JJ4AccqYIijsTC`CmyhWrOS7oGI5gOoQ-nor9D3ESn{=Ifq4e|i0K(2~P+LaSb
ze5=;0%O(X#bB;|@o$;P(<j$C)wDSAbbhBiY-Ph8sOxftds7yj0qOxq+A*`ycmXsX-
z01j!NXk6d`F-;*@^cnT11Oaj-!1m2UMWRKx4fluSD%4w#%%G3>6;y$k$&I12+}5OO
zxEbf@aYRr_mLYi+0b`$R)S;we3lOW=0h3U9>kz;g_a>9=MYvp(>(Eu1*t)@S&g1x0
zPO6K%ug%zh^{R{A>;$vt9kOv$C)6f<u-->}bg6X`DN}5{G|r#J$^Pf{tX*2@#;utf
zrgPu$AO5Oi-OTPhypz*CaY3+ViOh#{9QDO0%b!BrRCQ;bYnsf~kUYx}`H#xd63zY<
zhZd!JmdkfN#%?9%0IQ9I{(5@nu>AcirSV;iM^9;H2g!u)a5|Ho6n%PqGhASm?qo7a
zjm^#gz#f2m4wcfO7{>M`&9|XtH4CrexVY<*M5G$i`$nf6YEto~1Z7c)Vt*0O>scUS
zG7mKy-Oq0m#}uxp!vt=}y(seD%uUGIvGK&Vb00R{kx$-e<Y&*m)<4MCWn<#FC6qw3
zDRF@Ni8w#qAoRz7f2DZ!i!xowbjOIkNP+$-I*-70{Jm)vo+(U<?622wJpOf3rk2FU
zT>GByHRzT<G7<pJ8FR{?>J2gr*vy5Mwy^^seRG_8`q!QKW5n+Bt)1%(kU~m&pJQEF
z-30rYxxos<qYk(};QnH=wxIMn$gcr;qX!3YU>E3rtw$uwG{{4Ldk5Q$*F9&bI+&E?
z$XE=4+ZgBbHKA`hNopKFAd$yjrz81QGBz(NDV+mjDuiR->q?-4p}&ST>EE1V^%Th5
zJ4G^_=Q;Y~ts)k=X&eoO0*rS%s8I&NZG&3KiSutevFV<1RIcIQZU(}qYXOgKr~GTC
zir5F=)HXjtYK`<NA1np|VUMV-B^?DB>T(E-Rhm$Ga8D$1SM?^Ng2ZllW!&4@<T>m!
z$JZwnywL5Dvw^+F3I70iW~oa4TE>HsjifiOFu%_=k0GzLELr9+owk-F@t%Mjb^ic9
zm8JcpakX0q9P^xig>usB2Vv!i3xSTLo&`a7sXHqwouR%^0psiXev~aXA?kEDdZb_#
z8xS`Hmcc*Q@~rgMp-_JA#Pu8xObX>DpD5QC3{o{8jz7=;0A9I0TV0y?`A`!Ly6^xz
zx4AUbT$s_`-~2$fv|@?@W&BC{dJ5;SJZEu!a*(D!+gzM|Pxx08C;B=wj57hBQJT=P
zz9^xwv@jU}4!wBvtyMK^h0=CL*wpXT86q&T&niPB{y^aVAC)#Q6nShK>raV*>d06V
z_2e3%bkR#YG-Zk$U>a<V<|79In2ZKJe?IjNFujXaBzgvk;)^FmF-ssr{4CpmKZwY#
zy5CTaKnTFy$YeOr(2DU53f<(AL>a&*73dmQ+HBxcCS-w+%zL5rALp9Nad3J)3(`vO
zbI>BVE`Dd;eOLm2I<S%|@JUnc#d2`!h9>g&41vmmKQZbLTGEG5-bPO!m1ob~9W$U?
zkN`Pj?~Kwtz-)|j^sMOT7#v`GQ;dL~G70U_)uT3vWt%I8WeR;y@)bhrV5bBr<BSTO
z{Ikgl8`LN{s2j;DRmdFiniwM<L!6dyp7k7e3BPPxn;rXBo7*h88wcf3EsvD>2NgCc
zvqHhoaa;D%aacC-F~G^iYukdV25Q!XGg%uVm6Q=x<9?jglsT;vGia$Q3V;an&MNJ~
z{JE@Y+@l;CndoaxCSL^yKq_QKJmRsE=LeHklK27Evgb8)%@Yc@Y78g`097@;2OQMN
zKI7J_oe<#1Gzv!oqHARXrCHt$E;CNzv7Xk7dsHuA2*xX4U>wsJQ?O`ZzM4HMvuR+S
zoL5D#R8jy1CV@pW5BoVl=}ucUQ_{3pQyD<(P|XxkPstrIQlttg%zY^VIp(J2@}hM`
z>?%PPYADW4T2`in)rJVb>qzOIf~Np=G#p}?Cz?G%%bKvlm=~JC9tt4<(uL-y37TIo
zj+LQEgS`Z1rZHlTqNR<ZB~MzIB>+^YxuhhCA?brtgv~Kmik>nkTACyqKop<~QfL(H
zQ$#YZ4MKWVw_U=kNSLOHv0cqZUWqa)A(BlnW4C0~sA-D4Q@T(nlWxT@y)$(tY1pyF
z=vgvqCp1P5X;?+nb5d@|sE~0_98$0WCWQOcq*BCyxaOKeH+0sCO63DLrFC}93UHf6
zU}-QQP;r{*)PREn71<SNnn4Ve@G6Y0oc60S=BUir2Njf=)f7{>BeAMYa8sZSk7{+y
zrzBQn%QU<rZ62Ad+>O#Lo0!~u+f@4sjhr#aEC-;fkyuN(ZgIEQoSLyDk=ud>N$*2p
zVnw;wK*`VhMy^A5BLR%#@fFTJ@M(bB<Y%sUtAB1g1~Lb`0aBRUO}Dy)01GL0_s{aD
zZWCdA!}Q1?ezlO7U<~j1h{vF$^GlvgKsQoY3LLIA8yg17S(tfp05QV!AXT{Z`2ylc
zEV%42b6NUz>>-GbNSqF&Fh9>Vx915HK3nz1Mn9qEw1gwMoaZH>O>E+fWt1*>z!*Q)
zl-nqTq+z)ne7O~1tcQg2zJP820QJ&{Sd+Ye-)zLzv5H$5%1xeg;|oJJmvd^X^93i4
zK|ZzS)&Na!f-{bz(!H<579$0NDb7LL85#co>(`1aO{ME|LozgI>~WEding1JE;^UI
zfD<Hz_RUe27HM`U%Q4yqsjc|qNfTi!mN^89%fEt4GnFIfQUUt)`c@I%<@p#bx9yT5
z5MqNUr~Ax#X8cd-igatcig%RG<N?V!HKA>4Vvo#`K74$De>&)t?nGhAjF}Y2F6AXy
z9FJpOf#LmE`%=L-ureUp!1V|G^IRq2z$Lgfr{S4e%K99!Rhxz${i$-^Hyhr^a$s+k
zNdExV;DSN*&-AO1lOrT*avAaO*R43h&uqa)BqL$K>T%cdH7daH%+3J$co;e72m1d2
zFM5MEYFo1xK%+l4Fg-E<0M@N&NGbw=c7S;4(?8a;t>f7d$>kJh9CpQPTSu?}y~fk+
z$NvCaRisNcAOp%3LjEGfu({4gJ#o!wqHO?n0~HkMxer{9mAj4Aj8v@7UhJrpf;cL~
z{$L;HpTe*;Yp*5U2nAPw-FfJLophRvUo+$dWys^d9Y^7c<!@p0%*wfPBT@kFa6$a|
zHO|(i>6s+DI5Hl0=j)D#`qLst@=`}5=I!m%HELKHBs`J{+A-<Wf1hfjt6Vjefs+Gv
zz@z=|TAN&FHLvPIYlCSEw$=OiJbf!C%i6zpCZn|*=56OKxRQNqXh5$9^%-UR!mX}d
z2APF*88+^Hvr}0_O}u-Cc&Vid6@dWmQYrSx8CEsN$^fS2?pQfrJ<BU^l^ahK=8UK~
z908tccwr!9gMrO9@f_`rCH>^2Dvx@`I?Bb^*6@v}ds${HoW2L=UV<z(TPE0x4?;hY
ztep=@`xT|7n2nA{nArV49{&K1b}btuDjp%XjynGUpU$+Fq~6CrCX!~iA+5=H@=)aT
z?lbz2#;QiL+qTmaf#`ExtWn0u3}7pEAnvMn)^cH=%ai`uw?CH^&nkD+)-`%F5)0;u
zx!f0_$m8{`IH!fvK48J>0jyidp=|#7-NESV)AOo=S)l+J!CuE5D?7NFSxD$E1X<jG
za5%vtsI310H&OWWRK?_F2xsJf4QJh6`AkBo+x_5r)n4TCSn5UDDgb3T_Ub9c%j1!X
z;$zh%5`3Y7&wAOm)FlHTfTy)PiNNXY<U!T{01D8K;X5#{Ij-g>f}ncU$S$08U`;-B
ziS}n)lY@?NR^pR5!Oe3v)5zE)*43<;12ie!icqpNiQK%_KA|4pm@R9m$f}oU#Z7Ix
zA}-A49!wKan}R#lo5TT+t!KB=xn`_(#oZSox;Yhk*7Q8GN6us0pTuUc)rhCVKbLy{
z0PV&P<VgOtnrw1N>tdaF$*W5!Ju91r@DeiBqYdJL*0hYwr2}7Zqpebo<99i!#GF>D
zMou;)ViZ+o1-_Lp-ZbsG6<4^nDKhQmp=PVHpv6^a@sfH}G-(!B$flE;M<aFx3XI)r
zJDQ3g9V$hX)P7zwnt)u!(#aa+w0Ozw%`HPh_u39|nw1}QXDjoXu^gQBp-3D~A%!#!
zY0P=07^z3jp}<suMk%XLrcoOTW-3+`&^H=s7h$ToH9DpR2)H#NIHb^+gNjvb8g6N&
zu|g^(kigRCny<f#a=v(`#@S>rsHFoGA*T_MOh$Z>^H3GRtMac}sLRa+whO?flTQ^J
z^`J?MjwvxfJYtjzcP`=;ry#+lq{tLpo0F;HotlAOqz02HaeQ%4f^Cj7(x856V;MbZ
zp`EUiDzXd;-BZU}<g{@&9hSOivKNzGN?IF0S&tQ&aCkMPCrZt^Qgd0z>4A?q<nzT+
z^J5Q;oc${F>N0rEQe^~?7n;Gdp&)69UKp|VtofvA81CSD=cQSe7TSJoze=RAgC-hK
z2Yi|?Qb=n6x%solUA0s^JF;aZhgN@-Rm+=D>Ohf*_7$3_ut|l4wmnTDrfEcrAlriD
zsl{kP8X)<$?mxmYRkY_bN1dBfIL9B(uDn}vcLm4rf-1RmGji6&_-05~a4WZ=1$q4V
zt8v}L;9y{S@%}Z6hTteMgd@IlpVFd{h}>DQ-|UXG?$E5B=Tsz*IpqQVB1RAEN`@&p
zGVlKC<AGdgx|J9on;%}adI_W`Tgiz|da&#M6;Z6VWXhJ8M?rTCbHTIBa0tN#xUL&n
z@Z2}?Nerl~F#D1Y59eKNwxMn$2;w!Aj;+W)n54V4iuJJ3#;yJ3;<o1Mh>g~a@!i)t
zEzyGdAbG;e&@lc$`&9Q+!E(Dm7$Z5Zt6uP(%&JUeOoWry<~i-3O5=xzFW|d)f;2Jh
z$Z-5;>5Nd}8EY3}(`ol0hT7aozdWz47n5zvoMC#L^Hzt2^&PGLoUAdD6*>Ist)GQ2
zuWq)ryrM4VLz2hX4k;*Ed(3ZFqA&*Q`Mk}i%Lli5x{D-taJIs}TaGb`^qn)nQzCD0
z?m*~!W80vl)O<Z00hPe{dSjZ?om$BWC4E`Frb3rC@HoZ`1O9#MMXUo#5f=yn+CQ28
zL-VRW7))BK1pqI;4r`-pb}1|LE&`5v`hFepQx|k&HD+W-0clF|WBbJ8(||u5R>VjZ
z!Ny4(aon1U-U5uf7a(ISPY3*u>sm2F0rRoA94Z6ew1jSFCt_d;L+wnJOAnXUtg7}H
z&$ThvBn8K){OhNoiO%Y_%_JwwVhQAW<X1jyf|Ybo2R%J92nX=UuF~!yyBl+p^DnXD
zvE2%$)+8`IoOR}~oK@~=7VL8umS8fs91P(1=LhsP#dy+hDdL12jpdskPBZz}rd~&w
zU5bu6;AXt5#c@66s8T*zNmUK`;8t<m+9>51m~I4%_fH15EN4uDa6uf@(Rg<Ct(ERA
zK#Y#R=cQJcN|wkMSHxgshXnEU6;o<^GUFDI+lxhz{_7a`r`cK%!*TD<Dzq0a#{dQ&
z?6Cg0t@}MUR=bHjojmQ&-*d<5SvXH()R7Fa`4POVM%3q%(zZM;qCLgu_NJnRVL(J`
z!>Q|xdY?|i>smU0gOUQb+O(oc17#sh4Dfkg!=C(~rFKx;wb_LQx<zXk{nScE2d;ho
zwA54;-sIl?gqm%y&esz-4&387z^!EkRC#TmsOwRMjBs8+20l=Fei)|QU114{6|>Ng
z*&f{u9;R<sp=TjfW);cwQ%rY^lA9N9IRt-P)*PD5YQ<zov)Jt*{{Tv7_KQk$9BQN8
zjb`SHP-V-%vkY%=8~wgNBV5+1>~rNS@y2){{x!1Kg~<dls<KHT$k`E7{obSN>r?R-
zpDD?-vJRQgq0KS;$-pvYaay-=6Cuj&Ur;(zVzqL9f1FhE*uG;jC`Rxy3H2haqDvpk
z#tmEk%SHL)-IM$|rx+PHagWC|IE_s*7()}uKSNq}-zyFQsNl6nAeSbzp|p$vf@?c5
zwp+GfK;sp$ZO~R^5QQKBO>00f2NiKfb7q81#X3M8X_1kNbMh;pY+=;H)Gfk-O6IU8
zkSQX%d&MUsHG_9(GDxhdF2`Lb+|Fx^bQKUK<qlf}D*!!v{{TvRz}fVw^2SwiSoIw%
z4Vk4$D8b9IVJ>#@n!63=I@V(ZjGPRSO%k?q(xIyw=+V-_eoso$g6DQJYmv9OImK$*
zTpW&QQfP@l>LZXfOdB<m1;#+;t_g<GgIy4v&S^w^%mq@AvsKJztw!Mi=M_&-PQ_$|
zV<wTCZvvey!m8j^Q2Whh?2Q%J>15j*HhK!7Zy(Cd+@4KOdck5?!R3yBDs&(vs*Gfk
zO=R8Y%45%Z$#x?7fvF>Nnq!lXN@_PWo`+rTb0l*{DpIt~rl~XLlNCDDpi>b@H9k11
zD20Y;SLLY!(iG=3(P@Iv)V&83#%u~15@IuAp)Ho7nuHlSsku>l5)PEB!KVeM1*y0;
zJsO{%T8kJo@N-s}l!nPrGS!9|6%iB(vb%9n9%^?L7Ac7_gRMPi3(Y(&OJLa2YBtEJ
zSW|Y;n3E$`Ak^Ebfhp>=+cdaltXr!Y1{RnUdRB#u5^n`insC*Wjm<o=Mh4s;O5C?s
zU<*|%3t0++%Zk}Zwy!nVqI5;*K_Y-FKI6S<OVX&wh0Xvbvyr?EYlPs|WUN)Nqb=A0
zT`O9qGa(rN06prQ^TltKf;G>tPyW4U$<@0XDRwZ$t;>SUqA}g1?N;E^?GTN}&5-v4
zfkR0&@AHv>dJdIhIT=RO{Do4CNtBsZ@l^p-U5Yxf>sgNg19t7J>PIK&D`HsZk+KF&
zQ<}zh<(RkV1xtt~Lu%0(Lgxq3m=^y4>sG{3B#ZK*f!vk<02<D;v1A0HP)X$N=ku*4
zgJNT0^(+rR{c34D1e(5sp58JK%vawaR&CX@FWr;PPeK95=gnD__8fQgZfYX$FIG$r
z+2iswuC^VznDgQ?N0eDlU4y6iQ>~^%I1);@=MTsp;-$FL_c3HKsq``xAC*^FriH#;
zqz-eOkL6P0*=k7Hj!`NuBu&5G3<~}$#a)MhIp5zX*&jj<4l54a4x@56bA>;xZCa=?
zgD;fn*LjZwenyD4f>vgNmh#8SnLWdDY6)!PowFny0gc$l<=(79k-ss_G2aq#{EbeL
zV*!G>J<E^eD_h@lEmp-8(rpL@qsURv@CVkdT3EpwtE6fMFuZlCj1f2=B9Wd0o+`qq
zg>uIS`^+#ZqLDIeCJ(m=pbu8y53O=q+)We`?Q{;vLh*nz>0M0Hhb*eH9CAP$oc{nX
z=Um3VjDRx*M<?YC^JD)2uSDf-Q0XhX9LI)rTZnFan}oxR<Yy-!<AL70=w|bQX;(Pv
zy;O0}<^DD24HuQFd4Yz~!*em<XE@2PL(yDa+AM>iJ4wa>$7KWFrkib=){08y#|<e`
zRN=ZH<B#e80O}p<V%`v{tj?r?fUD3`Ev-~dxULU>{=I6Vxg4lb=nZz$gT2lf$!t|H
zjiii#Mm|;P{uN$GnTX6l0pE5AAJ?^Oh{kh_nubXBp#T7Vx>n5@+{cm}kD0?292{~*
zVBgwHzk0D?pEd_OhI#yPkF9mmh6(d-+wI3nrz~;1zXT41<A7^9#`+YcV};Z&QV3li
z10D0uJw<R@d+GY7vQhKr$1j+J2MwHeuA|0DEO#>72T0>Qu1ExDx9V$~()0<&qXpf(
z?Gh`aH_P)f;1a*gR}+^jcRS$=#^<P8>5w&=$08%654^RFci~89-jBCyAD52R(AxQt
zMZB<R*l^%rA96b%TCj=omCCUrIQ04-N_sV^joNx1dOjaoLn6r=@z)(k<y%(TNsN5c
zZQIZV?Oh|=xKpqJ><<+}Xj?x!V2+BTH7&K&(MseIX|N0vnKArbe@@k8m5l9obWVVP
zf2C8{p-W}=XNLf&AC_u2ymVYFtfX`Ij|bP9j&z60*=~E|Z^B;64nF9{e!n+0LTy&<
zl<tvFAI22%?nO|V_1I(?r6lB&$fzL1&Ige3pYLp758^5vM$1r0oxJOpAL(!-9&sCU
z`kJ#In%omM?_JGJiZqWPh}WKTCTec`g91K-0+oW~av*`fz4XDV&e_f)LG8#jJ;afy
z&fa}RQI~rRuHRFUO<fbC7jOstTYG|YR$#-oF_1fPP{k2oH*O!TU5z#<JMqtTtr9bA
zl-QY<jIYwE%ce+;#F69uyo$Qlep2wvarsqv^$8@9f-(u{+=^|*Ek_P+gQC-?z$0n@
z0Ca;_qPCGf@IRGFZ45GC`C=pLr;5;yCya3Am)A9)B6}J+GB_dw9AMSx(S;&29{&KP
zSw?R7{Hl^hQeeG0Q;8VsPvnY)_jW&qD^x)Y=owB0VqM0{PXG^dT9Y_A1&I7>F6u?;
z)}nSdr*kG?xjAw46;Xng3NlGJ8LP9EBZ19Om0NIO)EtWEZ0K?3i%dz_#6s;Ez~t3e
zl#|U_yKguNSAn^Q_5QU<hB}(fu)|B5nq<*Qx20W%=z=RKG0z6Az@?Z}GohoPww$&(
zt&4XU9<_;M9x<BeEmd$ZD@tcQOA~Hva4Hvsu*v4H1ny{L7~;B8Ir*$($!t$2ii>Ge
zfm*U6fyt^9l~rc!22DdqwJzpu%Wj>IWy1deQ&$moz^Xdqn#@egARpdr{(RN*BZFM@
z+H!I^@ZR*LF|tN_lTo69-iLBeCYWRk(wwY!#KlpQOShVGigi)*19tOLfyFaC(}PnT
z#oJ9`QZHIY9MN*FLPlyC22EPqQMzK2V%)2?o{D`cK$$f#kBXau8jI^s#wp=2DH@%I
z*up7>pPEk8$he^7Q(0*`Q-SPgY%V~{QI)F>DiS%Ra7=2gDHPx_N<~e<vMf`!(t*u8
z6z&LUT5^C-N^lvZAc82k`HOb;rbh%<Tcz7ZiZTXkl(Uumzy)=>Xpn$NuG~6C(04YI
z0ALDJlSrHnY5DZ7rxT$vyT`^&Rh9OPD65ilRV4vOtzkV4Vr5*Ha(5cfkvZozr#?wF
zLS8<#&i6W_u@Y@&2(je*R-`s*vo3b~R#3Z&*RL(tinSi0B*<GHT6&(QwTo6Wnr4ZL
z75k$<K~TvK@$%-VwxGQ?6N;@iv=Czfi0ztANJ<R4@Hju~a0gy<QF+Rz3QUK)Fc=k;
zHmqk)-V&;MDxRjKhG$h*3aC9v$sa-6>q3&~Hx{=o&lF?tGJ=2E#b&cd9Dr<<kF0G|
zNpa>j)hbEu&5~-Ywz9ug0DUTF<T&}4qkDEhyJtXpi6c||#YZW%Kv!%|2=v-V<Z6<~
zD&<T3qp%sr>zbKW-;hBAkC<?Otxq#5%ehuTvQ75QBre~=*dYG^Lsx9?rjA0=M$td_
z&|y#4wKd^T2;9gUu=&U2aZ*}AavKP+%Nq6Cb~qnHRFiB?*&A?3v#w-;+!2z&Px9up
z;fZ!`jvSyF#=>fafg8fKyAM#TtAqJ-{HtA20bQ~#@Oc6?{zQXaF{Zkl@|(E$6gNwD
z8S1H({(}^$19`+nZWQO1FO&UG@~bgMq=3;kKAFemnz<U|jqDe$G5-M9t$mfu{i8)A
z45=FgK)#0^{{SEUy>mL9gtsy=Wf%i2$G@<y<I4;VLw!iaLe?nc78wBK1KiY9px%hO
zb*%1rn`lxA0(l|YM&>?;r>X2X$I#btq-pWmTgD<kBCZD{n$SxnfHSrO9+>Js&-JG%
z-^X0{9+gz6sOrM4XFU--ah;%U<Iq(5Mmy7D91QS%>BQqAx{^7=oM-f=mCjGSJ3RZ*
zkVbpZ12$ym12szKA^{;XGti#3d>kAaWXrt>9B03!4GQ|6apL<p<iE5GjmRv{6FJ&C
zdUMwta6Re^3rL_bvSbBZNw{YVjA!uvRo%QX%yvqFn>jcnbgEHksSw=~obK*>dRHA<
zO}m|x>ZKbpyf8}`3=hr@9Y=HOJ!)Wqn`kI|l_Lxfw|dhSVPe4<0AL(;^dhQHY~U3H
zNHf4;mS3l>OQ&Q?nrx!#th}qE^dLBI(}Psaq-I>ROSE(*7ay%^q;CHJD?2_qf(Qff
zAk;S!TEvGUHa*E80r?7_W|}YBTSE%-WMHUeBeL`Qj8+}Z&&*4bn|KPqbNbfR_3e{x
z21Ll`<Qwt$=DGHk2@82~?iu+=kFVxUVNwYj!dD^_yD=^z+wGD7BSGB0e+sn?(1i#O
zFl=<{KOSfd1YvW4e)b~+^{HCbCs4-YVCj`l{=G$?LAJV(O|3{ekO1j~$)x*hTmV{L
zdw%Q+c(I8IBIy|WuQde52p21~x3cD)rPP+!LgJuh94mU{QZg<|v<K>=kLOv3S#$s@
zC)aIHYpq8q#DD|rF;{d$i`>zOsN)1xo#>+y9zAnVE5|fs#OVBTGlN-^T{XLvP)0ou
z_|w=*H!s|^yn_hjei_X^($NkNB-KlMnP*{<f$jxt+Q%9GK7O^Fnv*q4l@_3YE-L(J
zM@o(efWhGV=C2?CbBd&l<i;sdaLa@0Ds>q5r!EM^9#%O9tcC%xO>zjQ5&iBDtwAVp
z&}3Di6mHNrLs*wsLPkw%-8lq;F<94akVqWXRSueE2^>{dk%cA9?1b=YnZHcn^{!^l
zsG@bx*;xnhoc{nytm%`<s}}P4ti)p*NamOqc*zwDvCoC~d7;^nB8}Wug|OzTh`=>w
z62OcKmX6DtwqvI?w{E};3g_(|xZu{bEax0m!Pw`hu4@TY4z(N3PYX<n7CmZV2wt_&
zn`b_Vr6F>1O=oHn6^KXO&OZv;byM3l&*~C+my7`7F|>V6Wm)pUvnuoDru@cQM$a)S
zKIp4bLm=jxVB#nXW0pUlsaof8<W$j$P%jM^8fd2Rw^N){TY_~JZd;Lz5mjWB9Agwo
zBd#pWLr$iFiW#V9%>gv3Y9e{3FG>><Wa!+~olQWdrc=cY*ro(h70o{x6y;tkE@2`i
zOjLn!PB;}CJr@SWW5qy8r7cmGtrO5qifGL=nkk5Ob3hb;Q^2WVVXnCq5ngJ+=~1w$
zxDn!tQ%VH_8BPL_ie8mKE-^sh@k*-Ba6dY+8l-q6^Xh+~tBhM6i+XfTxEU9(LEP6{
zW}qM#$F*}-R}H)cUcLL8=<PQZ73k5jS{)S8rmMzDr)RZ7q=D9?L5@vm-PzKWkl8pD
zN^FzTt;*F(QciJL)HI1sPzFz0&7Zl(D^1fqd8!vdRCD+W=aM&rG21?FasL3<soFvl
zF5PL8FiuZX*i}hx8Oa;S1EQ#?S{l7XmUIWH`eLg{;fKq)v)cetwC|2sf%F{HWVsmH
zpnXTnOO=nB#aEU{<PYVHk8GdA{A#TC^Bz9NC2q^+F#blYO+MmDU_Px?9##VjFF)^M
zsb~$S(4GxK^p(x6sxdqT-+}e#n!LKCgaGizjB|x6lkM|o6^<qHqcTY*UHBPefGJ?P
zNgw77Cq0Q3uIS0Gr$G*|TjlZzP&$&wAC785TI7QAvS9ESk^1vl@d#wb5p(WIKhCBR
zN~aL{N>5^o{c3E*bZCn!jla2V*d0{l{)hSsk|{3Xak@2Q{@hqUf#Wq+*3$7xma~RI
z{pDs-e=qQ@Yc#_y=q_MAep!nt{KaVq>vTk<tZCY}nHsdB9!K10@t@Ba{3~wOD9mAe
zwm9Gnar)I8aUQ@-vc|`|XCI|#!^(m7g(v-x!~83@3%ScCT8bc}0XX!+{{TT#BF3C5
zs{I8(n~w?!^(~%%TC%D}0So-ARA+JFPf^FzRDfr%CXqlLs!ZTx`eK-dI2(F#RLUyw
zIu0u7IrOTMow*D15zZ(YB#Dp84l(WhY2^H)j`-%Hjkp-e_Z-x!+q>{-j8MgUWKo`&
z<Qk1+`95QVj!6{$Ks_;v02`79diqmj&dzwxr640ahBKd9S0}A8GOUpnWdX7Q&rD{l
zMhXqQVDzZaWx@2TN(cDnronN$K^Y^GIjXZqw>jK$I)E}i1J<puP<;<-ts!ijFvsVL
znJosHP?-q%2I2nFWBH1uaD<~0k;y0UsrgiXA!|xhSrcg}Pq%8$xsG6jBE+gYm1g|;
zsBtMs=CueQmOw62Hh#Mz5-|RPs5}lMD->qlA^qWiKRzpF?&HZ|B)E}D>aj!e*E^}r
z1c!%tTy;=b5%mCpT$K_!VA~@r&leC%dSs|y%ZyY^FJf$?06P8PL8=kYEMw%2K_{e{
zKdIub#XRHYHvRc3I(}8pYn$G~n(E~9Ap_sH@~oXg?b%P3L;8T3tMc6!f89t>{{VHc
zr}L`LWRPLqhtTG!CPy+bHkNJ1IRVd5D%=8C$Y5Bn(zOMwilBj!*NmD$f%26fT9*?=
zG4R|Cxfuj?+t#f4o^SyC$KI<df__EM(W^$%4dfODPoWhW>{Zb<q**vCk9yg+w3h&?
z<WkyM0s#PiCbeR>WnMV?RYo1m<uW^XnQ^=kR@DwOx}2q#hU8R=z>WoIk&|~L8<g>a
zJ*q{U1D+{a1~}rQJC}|Ksu4Ee<F!_qunAVd6c*ZeHCE-e<=FhBe8c+FPTPafrFSYy
z3OU-{jbPl$@Ud6qdivG7=O7#$spxAKTa4ru8SV{bHks2)RdhO8quZPVRL_uKB?Tia
za&WD+y$RscO}j7Nu6E9-p}~NaJeCX2NXB!Dvm|Pgx|56^K9y21sKJ;Lpz~K{i_c}1
zKnEL09@Q7soUv*cmY>=}AWZblU4;V>cGPfN8qkUrRpii>&Zi>G1r&4@r38TE7_7uZ
zQPi5d0Ry0`S{X}ISsdb=Mpvy=S#Ubj{?MRuYh=tC9L=y2M*w8kEj*ju-}}s3x34cF
z`#ghi+*I_gcwwbLdJ;4JYnu}8)KSNYyOM#k9k`C#A^br`ezhAajw!NSGv^2Js_nd<
zwWaK=k4iDJu=x}aamlJ|LLZn?$R!6M_^70EMh{w+Xo|1FsE=B(SFJ$es>jT<Ct7e6
zgQYoYY)V9mIH?694l-$b)JUrmuQc<(rXiu;vq%OzR4m+5$n^=yrneP^CZjFQDa}TD
z*2xvhKXpA#L7IA+VPeA5#Y8FK@k+vDjYdT&ZYiYHIFcb0P)z{PkXQ(%3*7K2xyL4&
z%v-6ZwA@6EARI6M09uhrEs|<6Ql@i^R;k>yjcq-~;l*_pvJIlSdlbnP(^;kzU~^rx
zo3YbP7u4i&QUifaRUmUpYik>j$(H(6ck}|SFgWI`NZWq3gpp~L<-#vLYcAk!B%0D-
zTO^(;q_O8Gpsbp?q+N_jlLVgiojbbloO{;n&~@UdOKYA;{3||IYNU>SM3Av<z*T8u
zY<>17wx)n5Ey?t%uLB>Lt~~`DrE?hp92_s>nt6fSt~2UuL2a3jPpw#30*r0RrlL!z
zVZD)d$wKGy6+Bvb7%zao-R)ZeG1Wj6;U?3F)1OmRp<-P^D@dh6Jh*`azRo{0TTuDf
z`JGefNA#@afBER!kEL6NQ=G1G>~UJhLn%9#gGldzb0&QOAIh~XCcO*><SYK#;QnH=
z<C<K8B@5dNT2|6$hGI{t>-h@W4^t^@Y$UQV{{T{l)PtIxWgz_9RDYjSRAP-A7!99N
z4O&Q}L6%oz?l}Exp&2sbKH-HU=nj9CSd2F8<d4G@RU`ug?t}0C6=fI^@{k9wTG0|&
z4bM5vJ9ao3s=_R=VpyKTH8$`UAJ&+KcK~%Col?4+V=v0PTY^d8eQH^N*|-)S+mAJq
z75@OAJ6p?zXwM)LPBX;~l@w@2GRCMk<%rJ}WIrbZ4Owq+nJx(GdG$Z%G@|}8+XG<Z
z1N{0@F5Qh?!;Ih@@I@ivfL<|yS&e*~NMZau`ucw?(`VG<NmVzE#EdtmPQL!s$+I-_
z6q0GKz^Uf|=cQvdw+EO|x~T&M5srHC?N<`+In(59D|7UrG!89pTY#uydC#?41fKb<
zNbixHd2%Zp^vTcV+O(mNIL1`{C?S^H1P+x1fCJFxl37e=j&s(aYmmbneL>AJ5zOi`
zHn1bt{{XL4C6JN#BxOFb59v}|OOSBOk^R^I02<4Zbn4m2?~s3$Ln#FC+DyMPu7BPD
zkHWI9ba+@2kh>@SGmtp{0KQKn@T>Dn9l~LVf6$NXSyMc4xdmk&gti59O(R&gWZhcY
z+qohs&O44jPPIl!EyCn1tLRLdf2Cc#xO0#bgWNH!8SN#KLdCKP?_BLAvAk~gCz9z<
z5)s&Y$_+vNr4hjLi1g(Clt*l;dh_qiOt#Um;YNKiQ0`UYh9;y{zE!~W=B=#Wcm<F*
z<Z(=n!!7cuU#B9pEv#HN9T(V&T9tG~6}p$miyq@5wyiDLsLvmcYHLeL7a3fN)`lNc
zIir^Phd5fnB4CFER+M?~!KkB9oRP&?R1M12Bxc>k9S<3%GJkfJzF)k!q}(ao1oH^%
z`P6S99MjvL)f1^8jC~CV2&8bsKdn`gP0Ce|IN;NsMumuF;P({-_;uWeBhrG>GUbR8
z@g7vyKYe7;Ny~0u;bGpjFXm{8bR-aZ8iL;T3-ZYL^FN7CM)$5di;?ND({x`l);&z9
z?YHrtrC5m`E(yr=G+kS@teF`xlfm?<VUx^WzcBqpW}Wss<25MrErpUKXdHpG?anI1
zFtjS81gr85N6GwYu*mByJDrNJVtDj4uc|mPe&t9^x&)ntJS^SCXhac*O1T}pYymY%
zT<mhOmlH@2O3;UXI%2LY5e|0L!X+SJ0al><nYopcGTa(_TLu^{im(#~hF*GBi5c9$
z)Zthq3XFMNx8iCGNDR|7hwiTf>rT|9-FV|Y4nHcpVHp~HXBaQ@etkbm=1ZQd?2amb
zysqT>5VDw$12r2G-npyJgbqzWRH(=wjcnuC^puK{BV&rEDP?X}uE@lm4M4*=$<0eN
zMP1pbo2@iTDOH0S-Nwl9oCgJ`W4$3@yii*;nVNwn{0vdLq1}_kNUBJr;u47}H4c3#
zm^q|wl@gAG@{BywPZVdGTvkz9f<sL+6(Lhewz`rnq6Qq(ldVWsj+BbL(9sPs<24}8
zDHK!4#R(GP$c-f{Q)*m7!-X{QOS+Md6)6tGN$p5EpwK$d(40r8=B-7M{5h)1YRVH;
z7h;k%Y?uZa#dKEFDuIr*%vpk3x@#B)139kxE!f%yf^L%swMnxbX;G9e0H>VeHPOmz
za<rW)la0;KHF{2Jv0PR&j-^GF**$+MhD6V~t1P6NX5UZ3v*jz8q|j%ns`9`O3N!Co
z{{Utu@8JjgU4JgM42ueI734o%qxzb!W*p`Owu%ns<kM42l5dzpD}&G~=QX|qATA?@
zGwK&O{7!15(it1)c_#LDA%7}8np=lEC=DV)LbPiiqP)rv<<_M-O{_!r_mM^F0W5zl
zwIMHppzU7a2GROeS3#PqMGEbaPX?Bth~&JLy`xlL%$)v}T(+Et0?Oh+{?*7I$!fzW
z83XgEjAK96r{pv$+-d3ebco6S0H%{O)tI$d@HXkzqaKi{{$i;ZfuUo?3QYx(yKk#c
z{1HPR`>25b0O&QPZ+(3!0VF5iAOZOST;-}WN&p3z^ilp5(Az^KL<bmu?q}xwt5`|1
zIV5XDmyAaHoO<9@kqIAqW@@J#lE%0yKNCaWkTAmo>x$@28J1*)<mn>?_UTL!It+QA
zDD>izE46;iGt;$Br5}@2gfcnUj0IvpomdE13;?Io9Moj42@XPztPLt$CU${<c*QUo
zFfpNKf2(fBD~i-^{JiRsqh>RX8ztlS9kJJ*wcJjz$^e&<*b2whETEd%)DOHdo%j`<
zOQm#aO{?m8$NVMo>Junc*r<8!o;^Evt1GQG(%bft&PQtD{AHm{q}(*t*Do625w*6Q
z)g24OlFGu;{{Rf<g_sO(_HXg7RZ>q|o~1a!t622-JW40E%1ki8G;V_+k6+4&H95J<
z<&JXM;l84{S?~6IirZJ(tXgV}gzj&UbCPPKTU#>T9VK*jGhIRk=4x)A;xTJ<_U|_E
z!zelv+XLxdZGGZ<tw!_gx@wh(b)2qA`VPXo9|~wQ&2pDlo2G1KOLbqRTdA&xHCV+*
zQ_%DaMsnAaW6pEvK?Alcr6Zx;$o*=al8CLsg4_&xkxrG0?Jh<S@hx;iW6hSVv$$>f
zmybi*p`U7@DT2eMYBKIu1ExDs8h31C)OM;7og^~c0vbm87_<FFXWUy`T72?}3H1sT
ze=}Vx7}ZD5`f*TB=*hK$x*t(c<8h{Qe`>WneTqqa_o=G4T9u>#@6q`6bN>L=tD2pZ
z02YolUilnXH8suLE&Gd^P!C5x%cXP4wYj26l1btV5J*i5#Gm$?Kgd)++mOq_@b;K9
z-w*yYtEpZ$6Mdd>{{T&`(Z5_~t;H-ZM*S|t`>L{kCaAspeZ^YECAfk{{{WCKoVoN8
z5B-|0!XaWeX;y>KhLisQimeD@v6BI;UO#x|Kh#yo?JUpTl4L%TAS3ezj#uO^Vzdbc
zL47t1dWj?-)~}>?rx>tB_YPX6iq9YF<cY_iY<`BWMQG!nEr+%<R-@!AkVdigf;IYv
z{{R}aoiZYe@EQKKGQtmR8dirKfz2lo+-kl!F`)kdfYLK<ISPY6?9+n`;GTUADcR7R
zW}&81WL|pobYH_XjPVYre*;M>75aMAVY;g*Bh*$+wYj3O@7*x@b5TCn`P|t50CzMp
z;N*0m9E={7De5w`{!5N!QJ+w2J>mczvqnMx0BaRu_Acsq^sHIr*iLirO}lh7itItC
zS<NbvfgkAO{A-<>%JwNlk=wX&(62SMGi39~HH&to8*-eA!fi9AntK}dlN-xzlf=sS
z!z&DCqH8}Oh6L>=gXvi|((RR97=|9Ex)?6EW1Xx==Ugw`dTesZPYEt%@V~&W5wX2`
zcOI1<HqFT6KhCr~=~3{=j;FamKb2R#!I2qMC?gzqtCgDOOnA1cQRYU)q85!<9-_3N
ziH|3xW@&OO+W=fE1Kzdc+&JKmP;**Ebv1<fsis&&6(XSZrXgPDmx0l%WJ@O61xF!O
zQq8~vG}8F*RJA$8c5lxu@`|o&E1Ajirp|75GfdvD+kX;kplkbB)nh@zjicx*8%~T*
zew&9142Ss-=UP(PqjQien-6<AcQV39-Ew_MC)5`|D9u7|%!`7D@~IlokGMIitrY0T
zI3l%oJJp40QNis{825C3De^L&9S&-^8yJ8{tl6S5M2)`{W;t<CGLYO=n4!oS>s^x6
z@}z~{#M8?Vy(-jppssOGdu%Tvp+v^TD1K-5tE-H0P{#%&>BU<S;**ICi10H|u{B}G
z6&Ax%;h<Q6o@znbn<Qj&PfkFrlywBP9k)FNCc{<{IH&o1bgs1>4IEX9yJ@>{YRODf
zHiBvL+?ygb>rdO7aF3dlM^vem$Wvsfj|Q*2dF@r+6O7W8Fm%ef=AXqhsRtCJrDEV_
z;5%;>6Py}(YK4wkmnbL_BUQ)=HAx3GZXrU3r6?O#Wj6tlT@|q#KnA(JJ`K(}=DMpp
zZ<J=cscc$SE<&RowJQvY6+(;(c@@%@f@N1)gr-ga$*XPv``?hLBx8(p9)h97E>~nw
za?PLiSNv(4+oAx2{hq&{T6qNfRBpmC#YL<;jFEy3tJKmYVLoT&y*(+3o%@g8Jw2*M
z8TmyoF_1_Mt)9Z6Xn^fnA~30eP`r8hK;+Qk7c*WWLB&GQ5D7J|&|rI14FgC%Rmk<E
z;ukTV9`zrR-FsH~(@_9xnSyun&~9V?6rO6^{{Y9dOBw$A0MGjHKLJaVxcQ98beqYO
zf0QDN{(4Gwuj|&O`#zg&e|bAyIQ|&&6n|6sid%yu=X}o6A6IPC;nD6cFn0NA*K>~F
zPQTKfmql>(BmU7~JgZm)N_!_6`s0)MlUmwjZX;6I8~Q4659QXW<4x1f6^+FGK2RLz
z+xSQRJZZ0}T=~0Lt*-ty0%V>20mnaw$||FExSqQmD2aMG!Sn-~VYXKLi};h$vQtnH
zFo)$50rLiJ%s#l_A8u;eYA)Q6Saccx01DReBFvskR7K>HPi5ovsFfHlN~lhsH%e8%
ziH|2O-_oA*#@{f;@9#P3{(Y*Ix(H+3zTl^5QR!1hDgrPtPXir)3aYHE3n?r+zCS@o
zK|!2j>rUWm<y3Ra9>TIUJC%hAoaC=0DL5X3{Pm?5jHl(xIUKR<D+^JcS!dhHT;-3o
zWnNzHh0|K<b3Q7!ZBoU@mORpOr*BVT@7q6LO7myZ8pia)Alg9cJAEtGO~VPxDyeLF
zVe!BxuHVCoz}NJ#smAwE<sK1?H}MX5`qv~|XKW^-&LZ+*=HstwNhDXbc=uy>2kB8y
z8!f<64svU6MDYBV_X_d`STo7*_)!(9**b|tx#1rVu6))|a@(*+KZSb6lWMRgLXnOu
z2TzG$i)e@jPfoSF9nf_cZNv-^c*ytv02=CqQq=Nl)N*Fh$jU}a^y4G1(9_I{#H4_A
z$ie2an)mlx8Fw#noDuyqRNCQ*>db)hz;ZsKwz@_ZxzK&8D6@`n?Nx4XgJ3F;#2zaw
zz>T&*O9RG39DntVU4=efx!e!%p1=Kn!j53E5BfvNl-R)b546<ANzrj4$cO!^pJE`-
z8+(J>{3+gWh-Nn1>IvZg07_P4F?9$ekpnF63wCgRT>JV~6tcWhn6(R4k-xk~$Na|!
z@-^4Zd1ojwOo<zwIXEAVI(=|!D&pO3*h3-P5&N;+#!7#{Bu<r_<JAo_Dmxo1fIN$n
z6Mu5yarrRg`BeV^*>@Jo8RBi7nR#E#aaPk!gpJ7*!F^$MnSY`D>cz4xx&6(|$bB|r
z^v!3=x|MRYF>zO7OmIb8MvhVQh@ATEDAgc>KFsKQ2N@rjt5FrmQ9<B}wHVq)(~qSB
z+>xAT9oL$2f({SoPSz1^U9o~s<5GjWt`Ff=V>mS^$mAT;xC-&B4D27BR*_TgkmKBQ
zR>S9~2hyW(G03TuQyOJFi1GWuk46TV@fZ1{+JD_OG>VwUYLuu`l{l;<t-3W#A9{V|
zBBC;(1UK`hJkzKlWjO3}R3^WjPtP1|J=gk*&A6>W$*s#Xk>!w1M`Kubm;~pLYU5w7
zK6_VCd!zpVjZ~E_E(Qg|p*^HdKQUQ4)ZUjW6`;2<CN8XYoQlt$LO}#qK`xgmQqf5i
zfO<YZol}=joq-Q*{{T=kS+w4#bSg#c&Us}5UU;p0X}73c^Vs@T8>=%5eDS_LN}7GI
z%0}Afnz7LNXB#te+VCU!Q}?S^*5KJQ$Ije1`gf{pYPWEJcyaez(xbN2&85K8uZ9)Q
z(8soW{VE|FY0&cJ2SK<$f&K=Diy)DnGJ9lJ^h(Rn8s#*pzRjqrIq1a?@~+4*1LYN^
zHF4o*`@UYo(42EpF4NwkErCq?Kta;CH?_+_MPcjmfYHP|DPxa*D?{yy+z@LIQn-#;
zoaZ=feE<~|TI6;)u`Q?;G@D}3%`dA2U%+$vnzYb5pS{g3jv%p#7~!$F{ZBOcBva7S
zrxc)(wI$00i1R9rlq`hjate-Hhe7g;imunW2O|cl=xnaZriaZ0g#xHOqh#dqPHWd6
zbmpLv@vw4EO+&G2=6XU4XBB!Ya5=?A8lOtP82M|i$CW8+J_P{a)U!l3^r%Q+K^0+}
zF@k#2anO~iSs@rS;TgwDa<B)Ql}N#(mf+%0nlcSFBP8=zWC4vc8(OKr5-ZCG9V)+<
zwzXYDEmDziYnjH(IY+_B#Z9v#r9%<mRh4Yk$>=8{tFI=Ssot!zb4U;^X$XoV5i?Q*
zspCM^DT51|)YUSPk0zs$Fe>p(4n<a<g*1kp%&9=!YO_cIsb@TzsCnd8PCAOolI{ka
ziMZ8e3|g<j82}nwg>s?k4;<BqA<JT)8s@KC#sM62O;Zb4)YC22Fc^%JTai91D%<5y
zJNj0X1ChIv=sSIDc=kltynvp6T5~qh$LCDlwQ_2-vX(d%t?WgvIouOb^6gPb;YKP*
zPs^HHj>1Xuj8#Vknyo)I3F4wah}Ch=dVGAgYBF=c>U}DBlq&rEdk#$`40ARq(Zzwe
z&I#?_qw@(kQG$3F>T1r_iGO*^e{g?Vv>~N850yYs)!(3~-S+VjCBNsRb^HfvSCF{N
zV`%6HN^-zS8?)2yj=%kH#(@-!Wt$2MG5krVeUC4-*zW$~4bAxWAB|a(SmPUaaor<5
zey#o$PI(!VV-XBpe6$>&!@tzhX>}U7tu?)&{n`hYpa^#y`?vH__|+?^<d-7qQmfa<
zi~9B-j!j1`!3ShOpy|gfe;?>69yn}(H%sPE)LVm$f4)agpy%mO<=CqU8c!^2VA*MI
zygbB__-^V?u&S+aux8&&j!!Z8X~_NK`zZbq^yl8Jzngfgad{_~ZoEdSxX-9X>G<^e
z(xBgN&Zq1)JsH=TCH4oYKadp~y}z$d^DES4g{s?@6Ed`lH#BN;5PG|I`gN^LpowLb
zzWU`<3}t<Nm-*Ivl($Hb+u&pydlAWYJsw|QVc*zP$W5{}#moJo?nlR{-{?JY`3{_(
zmmM?{)V1Z6Pc^<w{lx>5{sN=&<X{2G&m>ebU9g-+b0qoYdf|_+y%S$5DL~uL@i6Fp
zhqYDEtXlh7KonyGxE(%~AKTVe!NMTwPp7q1lI0?ZF#x*{k$t^SxAUy|WK!5>C!TRb
zoq<HyH*xJUXXVcs{wkw9s^g!gJCjspxN^Z40FHf>RXHwLFH`<C5}IWwmmkVGEJgt!
z0s7S|oJkJEJ3;A>L;nEQs`K24*ACbKVB__wH(Hv-FP5is^2hoLl-EM6%$xfNUsp%K
z`T<>30yq?7g#eOArEt%vIleOIAUIm-Z8a#^FC*^Y=lRpUnR2^I=SK^Llc+p!YDBro
z+Sm<_IjkkP4yvf#K2y)-`qYN)m<GrD-pAITv@&ov;kwx5Fv#zY{{V$bE)+K8D<Ynl
zKQQ`w)(Dx{dgre+h#bkC$DTnQ>XkxyO{cwUd5J@u^#FDHj{gA8kl)E6%x8jlBiGQ1
zq=?QybDVTFY86G!;s`xy^Ea@pTZ>V+!C6-$x#&mZO`hf2CV0qX95)?4qM&=xYE&>1
z#heTr5%fKew@RDr=_)PVoXVu}mKa9=0A+{ppV0eLO^Br(!yl4W^J4zvbzh;aTS=vY
z0<e*DY5Y*iGFSXQqN<A{F=m1`j#U2uS#El-y84PC-yDIJim>$Hikq|>nu%#?bNkyz
zFu!=;oP7^p=T2cfKnce1dKVjk`ZxaoUZRdjr6bH9TlSF$ABg_|3c9jQ5c%-v-^REh
zzP*p*&04b&3IuLID1E+R{xxbfZM<QJw<3WAQUcNZg!f#P{6|sxih9P;kXUosf#Ci$
z>;#Au_9}frsd}6^4e#EJ=aK>U?@s_Qz~}kX5gx_Q4TJAPD=$4g4{Ccyk;hR{g2~gN
zpkjG|@J0;=Onu{#>rlxc-G@wiGXDUDFtA~TaL3T_e=3j?;jvWA!#hqX(#x>yV}s~x
zI_g#AWCPop&PK6Bva*Kc;<K)g3<n_h6%@BT6?1}m8qb#Lmw@|Eq3Kvk#?gsZO-fsN
z3iTd_q>Ea!Wg>XOvv3ckWna&|fo0m&o@LxNkSmg<C^vIYtHNClv@CO>@+AdNp{QCZ
zqZv76^;2BUyv+mQT&-!tsLvnXZNjm1?eOShJRFv=WyWdFe7o7=K83dr)}Rt<$o~NA
z(slPm&+?~@O4-NH^1gbSUotl!qbc;-YF$dP-Q0y%y&Wd6^8HJe+JyfAc0~aH0JsH7
zsM*1*+Ie$u<L*d0p{l-YyK%ceQAnDD#MpG;RQqSt*Xxwzttgo>tkx2&7bo|t-}}SV
zS9Pb}toCtB+>U;txx3+Os@wC3=TY)W>Hh%Mr)avnLpRuO6d_IqKcBr+WgFcy8gqpy
zHl5S_4zqsl)Yit{l?j*2#!f-#M|x{YPaxn{_eXMRtDzclpYqRv-mp_M+{YhX?~lW&
zt$T)H1F?S9ou#9BvP+EZ8$tBXTE<T89QBtySi|OSn1ggQ#zFz8)?5%WYF{}>t&>Mp
zh#oAid8(4ymd##iS3i4-X#n!CK+%lwv}|w<LeaE?fNN=#<D43gXvf{DaV<!#8jdQ;
zfxs028(S4R9&6CC<J#=7A{%p6<~um6VSr3lb=;WmOw}YPZuMpnxK%iQam8MZ3XIhx
zO9o6FQy6BP%t@;9GhpJ7>^)x`R8rJI*f%{X>`4{RQEE(;gkB9<i<+iMQJS#?!xZCU
zELqw!Lm(#<EUQS)H)K~tU6D#9s3QPXqPa?^6u7HNAS+jW1knVG!K~|%^6^*akaJnn
zwP7b<MlH_ZO-BrDdezyZ1m==89fd-b*;la|Ktawb%AKQ;QYvIqT~0!POCEp>lUg}g
zZbgWQHGbcw?1fRmW7GctuUSaZ#Bk?(kKj|l`k&}OI<}I$gfkrAa0dVnKs!}NUCK`8
z%ri3`v4R)79;f^(UIz-G4ozWMu{?CGc!&T2S8-Uh(lb+A5w7Mmo3<`VtgS-$E=Fm&
zB(BDy-iVTGwevdFyLn5D3g+NuxtwD-&2-k0VFuAt2{|#;ii#y}l?$jGxxn=ARvQb`
zlj~I^%f1Fn6Wgh)mWD`ZW_H?xYW}D3t4k3nfq<i{9)gh}TXoF8Gyec^9-mW8camod
z)b>>!K7xg&;K%u@hY*507d=nY(vU04`w(aQ$FTaEdczzm=1$*4Q_1{lyw?pN+5koD
zi_m|Gqix0LS~Ecd1&{Z#58~*f=}|P#BDPp64!wu}0A8XBv5dc8?*9OdK{Sl`T>f1J
z4ptq92}Vgy-^6<U9+g$M$sj~lc^u#Y$tI^%XT))?=RF7YsO6Du<7lnhbFjpd4*tvN
zKb=Dv6UeqXSQtIm@k%)n`!B!0<xpE%tcB;FCM4=J%bvsWrHT^Fs-rB&fm&uHfntoO
zn&0xu5&Ud=vwM1b4@yaMdJX7QFd3O9x&^o3MFMbKb^vx?=laubCNWO93qAJ<8X(F;
z=oEA<`VV?8FCWinpl>?rPnG3g{RKS-ss2K#s4pkkZCy;P2$thNDlc#kW%X|T2OV2>
z{=cud5iCj(=5sM(KjZ4@PDv;JS;l*l(?8I272Vmja&W717id-cw#nHC`(CH|)b}&T
zVGKzMkjg)Bj5%EOm529_Vc1q|5E6XWUop2~5dQ!h{{VpR_)=}HKd<Zh>^0ncB*PQC
z{G_g5)l~Ff_nY(|T8qxYeWFk>Psp8*{{UEj^}T5>SUkQ@m`qWU^2z=7{wDr{smDLs
zZnsE2MH{yE3Of(w_||RPLXs@YEAPvaT#e38sj8-9yr1rhagfQEWAia2{c1BG?xgp?
z{eSw^E$UK4Qe3x_D*|~qC)=<1R%2f|bqB6%LRiAvk?+Z>MgYx~91cZGk)lY+y}6c2
znRj(K{LeqqsXUT>qA5!sm#Fo?z%{oup7v(iJ&jVD*A3GY`HHeKUM=E8>6XZ9d~V2h
zDLZfmK9zi1mWyuRn2hF^>{-Ca7|kwVu^qM4u|DF&@%h&3-DC#OC#_^9xl^@);<TfS
z<-qJHQX)vcvX$x3boy0+D8~HYcly;V70<A!jK^yz=W>#`>ygj$%~~^Ny502v^Nf#r
zciIKaYysp9@%-xrZWt=CC*>i59jfv(M$2&#$2^bjpHHEyQX!)3R}J@E2UEkNe-P|P
zzrAQ$yV)?77B6o65&XgT{&kx!(9z0^`>8sAu-(-AdR5ywwljRYk+^uzfAi~9o}{y&
zj3mn&#Q5zS4oV;H`;YcdBer@~pDk{9nF7s>?p|^|bNoZ~6_*$`<IISOA(M<ZL+pM3
z0R3XMZRSa1$t9CHB%hfNx&Hv!zdG4FkU=C35g8ncu?sQ`smG}54O=K0J+geu*}%d7
z06(FqS_X#%9E|nsX*k<d1j?%XugLwe{VHhWAnkBD>*#)>qeT3v&mzW*(hwC!OCG|h
zF%|-;&JI0^;+j;W4CGT|aOxdT1Y{B1denkUt_}zts<dD@HzPegX&HNpP6+uA;A$tF
zCk>EA0~tJO=zAK?lpa-ptIsOGD*d`vRrG*IyCWTltmITtPJ8*ICN|pYIxK^5{{RtH
zKGkJ?`=&BlPC6X#L;Y$SjY2GeCJuNfy=Tg*?Y=TP<E?YfZ0U>(a_P5DrJ^ZtpWcM-
z$Klqo?st61<B`~oYJKt}?^k4ye-fM;mQ7vWJYK;a(vCMc!;kn7=CaaWMyl(~=OmR{
zLXsb$s3eEy-lnwfEwr22aRuXAOnOIjK3w}a{<Q?!UG<7@mKHG|`DmXqAH-I3i$<}M
zWgxu#!||&rfSqX&gZGC@n-P)Hu{OC%L?yDP@Tp*l@-e}u?86eLBCK2P4UjS`ITxji
zw&aW~pPg8He2j+3rHrJAGQ88g<UzHma-)`|8gtzhS{xpqh^X#ueE3H=b^ES5ioEK<
zH$~LeX}Xpvj`7c?Dt(ep=Pa-{)(pR+>cv~^SKsgx>+UOJWif>)+mMWdT(#0Q@~cKx
zITMgb{#AcX)THvv7YntA1x;xyCApeaFzepf(zuZQowH8U{!1~>EOKg{)9zJO9ZgFR
zlmG^BIIPpu-iPlhxvs{LAf9SB7#SHf*umhMm}j0ityZU7(4iyX^dgZaVmLLR@Z@~z
z6neK62l_N;fLQzRC~$j?NedIm+yyO8cSw$0?mmnDG#NK81_JsH1xCjM1PW=`R=E_a
z!Lv?{s?^bfII60IHF{3RlG;p|HbJVFF&8y<G|8!jzV2~UkzB<KWGE`@Nrm83#jZIO
z9MKWps=;y$oYN!(jwoY;)|{#c&S-HOWmQsJ6>vlfaw(}8-JI2snL&!mak%tmeCdJ3
zSB=O#R*cc*Y6xKj^sKpT#bPyQYItV{nz*+9!jCKi$gPsl5*Uw#sgitgOk`fw6DbEH
zr7^TgE9JV?8<sh#+c>JSw_{xOJ>6L?q%}tM=ARs3RA{Wtgc+%U5#w*p-lw4Y4)ox!
zsA$Q2ye{Toh<l#@072fW+)e_Oh!s=Szlpx5`U;Xl&Py{APjgT-d%9MwX`ooFgy)LU
zwPVI}QAchECafz3#sx}?MH;rdi5%coyfP}UT-QGYjB&}uX$b<x2{od2MDEG1?)IVP
zx%u7hB*4#FyELA2U{#wN09+Os_WuAH##fs<BGgeXk*AWNo)@|Gu7hY*3I+$I7V&LR
z0O!yOk@hFv`H%NUPsX~T9#nJAT+D!>U|S(qyC3t-MYt*35G$Upe)0bR3X(=?4?lqP
zrWFzJS9FJ>e~<W6WQ;4E5_YF!)Ssa5PiYnfASeufhp+wgd;L08GE7m98+50u`g;nh
zNhboO%E4jtOwQm8v$3U*hWU9Tt9pGu#;b<pfO_?BU;edB9b%5A_bS7j(;|(hauWa!
zS&sonsi7k?DUsjHgd7*gUtl|$Cc0lJMG)B#<SuiQ>Q8e=n-`$zrBH?#G9y2I&sYBd
z0qs>;K*j+lpaZ$3S1rPf=Yfv(D@-iTBKhu*%eZs1*?mvFE8KRl^(&_?Hi>WF<q?6A
z9(x~C?Nuk5K<#e}8E$0HohUz=gX!vjr8Jg}+6YK%wg=6VJVZN;`ud;1RaxC)fuVyt
zTZepL$^HxN`OxKM{eM^dfg-fB?ai&BGD#=PB&s=l{{RaA0JHr5m87X2muM5^d06w@
zI}YFODgOYzo7ht<EmF?k?X!cq#v>WQ!Swe&y@x%;RGLScW|8rnZ2)i-dJkb!vb!7C
zLKfX|cb0NV%R9H~O-}O&t^Ck7mr}5=bx!T|JpQ#_La0n`NI<@1<aOKGex&2`9+g<!
zJ;Sul0Q6DNW1t?wmF?<Ph%RK=8z|tDgZ2La>wj9W9D*xXex^p)K7*6}DK7W}C$>-H
zP+XFt7>Dr23HrAu`HITrq)+GYxg&v-_|t{RSXqh2SSb9BP@Y003O$GEPeLSxx8y1i
z#Qt=yGTNUu5xb9X#*jyhckfje;hoPI2a2^6z^)1U(irl9;Yh|tK9x>b0SbLjO1NJK
zIpowkx149AP&8ML05(2)91%`1g~obzse%o>9&?&=sPgDSY@pHYa%wfY2`E#+Clt3c
z43MqPDulU_44oZ#`ubHNGgRFyp@8e05!_T~_iY@bqPF5coo6dH^#`cyT5+j(fIfu&
zw9_SoX|m=>NcnjJuG=@3N=g2$xNmQIh(<j0az65@Kg;@6u@_&?t!_glfQY7RdANzV
zW7lt^52x}qWg>W8)rZWyb*RyI;Nz7TKH&7N2$V?}Q~t3BAMG#rkL6k=av1wNri$mu
zDjGJ7Gb!)RD_$7P5(y(YL!GDk8pMrH+rS;5&;C7Ic8|3PZdCOY;`%a8OL8{lN6Xz$
zZ~nDe357d(Q`8!x8yH`%RP;4$N;d9Z-9>Dla#{caWas7Yp4EXp%j<fGx3?r&!gB6E
zkNo!h>c*viB=dcbPI#K>2x*~i>U}f89;2=*-MkRl+n*r<sXLuVWBwkMokZpCt-6bL
zOeGPJG4lFT%*&A3>&AbjXWdDJGA}%4vredE&mf+KTcP$fT*+u?qS?7D+fG5+PwP-U
z>`r!1>BA19vYPYl7VRyoGX?wcH!PpxJ?kO$WRr1aZOPXiYFzA6r*om6IienHG6^Ca
z<$ABwRz>~B$c*XN_|bYdzCWc$ZK%kYlHOd7xXnuU7I*4*e&%EMiVTD9RDUt~8qKu!
zG>lq$ob;}fatKm-2IxoEx8qHP_Ayhq9_ndz3mNRnIr-sp@=AwtpT?qD0a20&9>3>`
z<dyHK)~#&|22<asOw|dl0x?23Jq=o53gA{f{^bOWj2ff}uX+`*>Um#UWv%VylL;*D
zWQ3lfTZ8o-4I;_rsa8F@)h1?wU9I;?;MBL(7MDhJF!+Kw+ZXzD5}t-ge{}x<rw8(@
zdvux3=x<)#i2Jt@@&5pQEpsICymR!aqhTUsAHt$j{$+DTPqR0lMZ9J6Y|K|9sSwH=
zKZjbX7UI@V-oY$H_H-ht+o=)^WlyN$wc@zcQ~?$A=sSGmkI2+}kH{Y~9Tn&+(*~q#
z#Q+Hq5$#9YFTnlix1avHK)L+JDl@5D^p{V!2itJ|wE0&?PQ8iZl_ZnoVh3!~+9+np
zW%-ACsxK|~Wh~?TBx=*JzPFqR<h6}GIVO{9t0G*{_pPCr<>7DwaaNbhnL%^~$A0y%
zGysp1IDqVSXZh7eRtE)$`f*uJC9^T9QZ*xDrt*>n!vF?BZYxe7nh_wCI()z!pVG2#
z?0(Y;8N&~{$F*kZTBNo&Ceq{;aC&w>N~pBa%kR<mmd9Nc-LYV#GoOE!ucW+XHmLR$
zk|U1b2_eRNf!?1M=+6ftnv+jsz7vH=-5N}>$lo!;d(~-dh~!|3P2`Z+%}TTG`Fdy8
zuIgKrs@SR*X~Co_ry$jdQ9;2K7QhB^QsZKiC4q#|gG7-5&MP;=_vvt(3FKDQzyT}O
zRPL^M3LVO*!*Bwx$AYzvB?$Pdu-x&AOG52p@dFi3bHNoapY2trW+x)8G$zW7gttn5
z&;>je^rUUoPQ=}fP!^OsX9Ad8Ja?%fQd>Bzw`RApB@wfVdO{Ruy<3R}-juRN3e#~X
zn5q~Krl68k+;C`>2q&7)bZz;`snHdwb9G3hHBb@}#Y-Icp|m>^%r=V5k%4}4eT7-N
zy2{{HTXgd!1lOHjZc|Z5HAi=3TUgo93}AW+O^m0dXhj%Q6v>wdfnD^~k)&f}Qpc@1
zW6dr(=~9njRLe+bmw{2^X{nCttuqy>T8X<BW0Vi2Ip=v!)jU>m%3ZdKzikYX<gNlQ
z-*z}v`kERQ%eJY2IOo*%u7b`VEdx2&r@88Wg0P{xg4~6-9&3)WxZr=fztq*pZk}c&
zxn}N3q>_nB?B5fNVK^SW%|uwc1zu0JQdv(3qbz$=i69>;9Ov&hudw#2=D4{_pKyG<
zx$ZlCKgO#{;Q>|Ze)E4ydrBAOA9(fls_t<^VzCj)Y*cT+z&v+8wJt^~76ph331h$m
zl1*mHAl>uhwM@u~iEfDtjHE!}pWONnf69^FEJ-9Rw_crAKk+}HsuHuv#pGhn+_5K<
zOKrn)OPlwd$3Uy{qNu=8>?$U9cUY4wNx%SPlj%@n=g(nQ$u<lkjZ0ri!Q07*AK@SE
zAJe5x+_hzFTSogJFh+OXH_FVs?9cG8zp1EYb8Y*W8BNmzzCd`0cftN`_zFltfEZ#S
z?q7#>JShI^pXO^O{^VraKyrkSF~{+rPxnV}&Z)jt{eM^dgq6(;Mn;K}AN514k%J-i
z_8x=r>sBqTqLJR><l@#2Tm7TiN7k!Y#>N=oOmBYYpdHEf9co*dmU$wzha%b&^5=rb
zyB~8xdR>FP?1Z|wdzNYB7#$UVx+=0D3%&m8jmO{Gq?LopRetL6MZ+q_sDAD@ztDF7
z06w&vySoc$W!)100J|sVW9CYI`wze$)|nh<810PI=^H$I-<Ob*YH=E^WxFWJxPZ$#
z#y^OKs4h?!cmDa~>Dsk0Jl1~72_KbI5VX(FK}n4n%>ii(<Ei4GBs(JV$EHuYt#7k|
z&NvlLRLr1z(Q6L(Gkl|fGm2K=f;x4puNDpm(xP~<4J!`iBP%yK?@%Sf36DH~TDj$6
zdT~_@ILV-7zi4n1vBqg7yJw8x^c>@{rj#Xz(yKP*ke<hiVj-GM#CA0|MnZZH4Kd@+
z2pwuv1&Zg6X^2A!S@G#wa4+2d0CyD-j51{X<p3Y2Z}6*;$AUe8swI;uaEGokKU%O8
z<_3{h<Q|2O^r_Sg1MgI3#fWWidV#_B6=j1&%E);fgHbAsb5lxpLBZq>D@ce~Wgpt?
z<NZ;$7#~*Teif|~sP9~@t>Z^^aIP`}ZeOVCKQ4a?)Pn3tnM26!LTg`8Q8&_c!B#wX
z{{R}RsOpDGw(|xaMmUX6M*jfoRamaG*QPtxQ(cWx=)&qmEO8Sac6y(o?@Dz!SD!*B
zriRvyZ4>X+Audd$kRR_M@6UXmhtjsvZRC|0an*ml^#1?~$cE?aWg`G8fwOOJJ5<xm
zs1|6+^0#1-)-&%_Pu*XcN)}coxrm6GTXXWBN_v0wtCqeVWW|(VHv|#T*H8A6Ri})C
z>Hf7xMjzf4%xd>7J@zcow&QUFo;`hswM|=C)|{;DX0_GDw7z6+%OK=)gHLk~v=xCC
zx`-bujuKPoeGlkrIbCNsVS;^oA4<;+hUeD@+PNJ@wCHxIC$CZBdz)K_j#OypPO5v=
zw4Eja<M|t@Uw>+qW`_PGhWW6{F%`}^UqIfY`Bh1+X0n*9s4{s39IFpdXl?Gc4K{n4
zkX}fkmvB~Zyc}*xJ(zSqTCF{t7j8wiEpGZ7c^l+^_Za^GBl()IygG5g6y;@_KPq0_
zD<0n8O0_GiG8}EHDJ+sjBSA7UNQVrLFjk?wD(SU{2fw9Qp7cj5+iGa?#-j-`{{XFs
zdmrLoWB6jBy0>(VU0r5@_>ST}RlSdJeUCIXZBWy`=Ra{9+eyL79%>huIQh9#S0{Wi
zNd_xAXz&R`$E8CyiY!FoSFS}^NXQ}MJa?>^q$tDRt!P0pLJ0gtVI!g_*|cC-1{J?+
z3a}N4WD;BvTaY6r2&~rRR4tov-E&puu!=+XdHPkz<R_9U!jO2tHKB#<U`wVL&&$%F
z0pNP*rHi`NIhHbT>4Q?@i&O44$512Wen-D*xZ7mpR#=5YmR74NBd<!yr?JO~qZvEL
zMq5-?7;(p~Qq=S)ucG0C%D-|!)PMD=<TnWTSK+;C$yQ&PO*p$4My#p(l@-y+>DSj9
zb=kDL1zk_zN2so@FqccZG6VM>wNC!V4Nl+6kZhHCj#1T3WLR9>X*Y(}@L}_i-JE)Z
zT3+htoBKD<P3Ux=GMpX=ps?@Or-h=uj5s5HKRQ;3fr6ZlwMks=r&=>tL@PKa2Bc6h
zN|E%Yv;zgUV-&&)jIME2siudTXxiDa)Iz&_q~g6AJM_4PST=Gi#xFrhOAdh6&W++X
zg#=Q$!02n!r&CKDT&JUzQzmNMN&@D&IJK2^1WBL9t;ei!_o^i;u{fP8N~_kH_Jk*s
zSifnKRNi=B=T&CAW#K{1E@LFMI%SqJDme~%S3hrgH*jkDU6a=$t`6pkD0}&WO!ljm
z((}(1Rc2Q1D#9ouiq$|;Pob)lAYQb|Za`CmS&q)0wNhKA8Km8a<uA!`o+<_e^HdV(
zm#b+u;aJGhC3zU5nv2Xku~J+H&T-bNO(&8vY9GQY?ulh!IVO?`f;i@@2Qr-YqiX^y
zl8q<N%*-K8xm}$E7h#85oJz0Ox$yEXHr2Rpj^GHc_?yXSY6t^29qHadQ|(T-V9E_k
zA;R^oc~KW-ciN(}ZjS7LII9;DVDfM(DC9{F2+bC;A!?RzMjvdKGm)KyY>f}O>feS1
zEyR=DW6pn(e(Y*Cs{H_~D-b;fDbEnaODP$V$sv3W6jqg`;gV5r1{sGW_8z33!n3A=
zX{GX^A2H`SzyrA_uoW$;BnV)DorLB|2*H&907V^#;%aO=of^!hEvVap`_KLF@D(gm
z2++uVaHG`LI!?^%1bz6cZ7LT4ACWj;e|k^3N%S<LlK{IOO)=&K^{nZxil_pc1IcqJ
zjX5~UB;Y9SN$e_lx)saR)rneS6$V(+&oF^XK-qFI(*FQO^d9}`pJ|OC@}O*?`_It7
z_z!BV(g_Fv({A3R+T=TMGfX`3PnE`YgTSX+qR8zeoI@Dg=hx7EPx#h$zJ&^->`;q>
zGc*4Hs9a|qw`2Kx)n{+8hEUnNek6S7U_DfO`k%wrrn|XloXpr4E<Qyc^2Kq-(Br3|
z_cfad@|W!`_p^wB_3qz?^ZHdcZ4i>arKzR+G=FDL^{>R3{t^A(@%*ZQi!x;N6zGm$
zapxzkXj;OF9G8;-VTYoh`Do{&{X3ujy+iky_9QR%TX7%!wv|DD_7k%GPgD3+WS3^<
z9eY)FndOn4iDqM#C%L3z#ACH5AV`|szG7JQIW)49BxLuIvG@=Ezw)GtJVwXx6>DHk
zqx*~c(4K%pDOeD9W6)G%I1JrSYJ7-gpK<3L3Y0=eBvQB)1Vw|KaezNsks%1RKEfkD
z<bPTiZCYsoQf`<PTgvLhRnZx9npq>l^rLVy3T26?WnAET)#r?=liZrDS9H0g1Pu-&
zHBI8j&{o68i0Xu?oG+=O!%V92;Dg$!%7Y*eTGn*=YMiNtUt><fxmX}B1xhXUu;YqU
z7@YnBtgX}>ZdsHM;xRTqJX5g6Sn$UqhF)saE0zjJY*O2`y~hzsu2-lBkSf~1JitKW
zor$>%0*Jp;{VGWFw1eE#(?$ps#auVg(qt2-DhI7Z(x@d#zz2i%qDI*0z5f8inxm*V
zSfq43vWCad=B+ZJ8?wFNk{f`*hVQ%&uealhjdcMAs9@-r00ZgNe>$xl>AQ#)z+^c3
zj%v12OK6?j_p7-#W3D+Lp{!K(b~dLCWSO$pb2B_jPRAI}r!^!6W6LW5K*H4s7{_d8
zla|L{TFGmvkg<IH-G3U*PnvgoBPc`KP0OM>s5N1DuWcrL^ce&GdLZ=oAIs@euCmaC
zAw+=uj)$X;<NDVQE+uA3B4U>)`CNBl-ns)JC~g>RE&(b1(~dx`-CJwas+OjU(&~=|
z*n4+%K-r00^N)Jfg61O;xwZ!f=9(eZPI`}Q{{TNjT)a*mLh>f~*kPBxJ-rQCQ7)Y_
zG;-!9`I205>5@M}Ju~f58k*hz0K*dqTUl;t-`K}6`)#2GPt7E2+tr7+sO?xXj7(T{
z?kiS1#<ZB+!Wm#MiDMj?9RC0?dwPy3x}}=QAKLAJ0AZaJ<Su#<>ZkPSMB#l=(+S%~
zP>GbQDEz9`z2c;QWw}EovE%{wqdu$t6_{pBkKsMOl<zlp%J|6cD>(b6k6p|6R-yp1
z#~^qg?n=E6vGhLGSg{<`mUmKHNb!&)g~!bsbZ`E>SeDv0h}_ze@P87Yyob>L0F7Ho
z>WNO;GdAH??mr>^{{UFq)7rBv?rwDHoFP)J#E{4`k`HnHdWzGYEDgZbS)&(7W$7aD
z`0wvZ?6i>Ut6P0W-R86mHVnxVa`Air0JOgU0G)GF#`fnp%AWPP3p^Jp;o3J-m61*e
zHDd18EkJ(kM-AJ5%RZ6Jf7$G<?a=neN-U+e<4wnNmH{LOvsCICRAaSAEwe)_0;L!n
zGLeD}IUMH|gwfR(bZzO<V5=#vrqV$m12{DS<{KI0#o4~cuAa(pR1EW3Ey<{Ed0Ac$
zHJ?1@2jz%0ZV9n~d8c06$8e(~9`&j@x|HG7j&RN~p7mkxV3ZO-IIdGtw79pB`DB%<
zme(>XVc3E@nym`uI_79(A1T^tqbff2P_nvN6bC%|invoehRCd>mCkv+Nl}Sx92&Q8
za`F79k?&c8Go8xZ`c=4#0hT<9o>Oi}$~KXrr6lD@2hi0^>sart%rh@eRlA|}6(lo8
zK~@LrPmo}6cQ5KGG}fr|>fxnL-WwQpH`f|n(6_li?!RUg*FkW%k%FuP{{T9|xw1>h
zN=P>;=togm_V+g$J<7#!#iBoQPR5AN#t@TE=<4_FB^Z$1!Jv*#R+|*C>JctmY=CgW
znTeosk{pWNO6Q|WqMGwOuTj&$fI5z%xl9z8fO;DA3*EnEz~|-SxOR^h1oq8#LF!=_
zdzyA$V8$5NNv6Wm&Y_9gIW>!?sK_F^T_|k@o2Oc-SSI;Z&3(PIg%s#*QbsuyS1%wq
zJOP@0ozY2f=e;DfxS_G99fB`WQ#GV&xxl9_!W7`uE8E?Xk_|LALv)K6EgN9gYU`3)
zOSQ&8tlPatc_b>J;;MOa%5r<vskks_ZD*;$dY}dX`c(5;2U10G_O|lI>e1vH>FsYM
zjxCdpeQRjSGeWiYDwu>)b5<jj7oICKRKJZ$!*f(^^$qR@>cVX&V!3Q|PcpibSl2Qc
zWhyg)TNX%@3^D6c-02F6TzzUR<*_X$W?RMb4At1})s1JHpCVv{a1Cl(OtOrMTWHGB
zXkCKI2<cC25T_std@{0u#aZ61)YZk3+9E{QF;QGewQ5#;bUCWe8_L}=S9U8S8uCdQ
zvNMXB!<PA~`&bt0I`^l=r?h94!1bkeJqgv6Eki3i{E>{ibm#Rx)opE9;3y=X0IHgk
z{hAhr5td<xVc*z$e>#~;%It|*6<+!{;xNVY=kLBTy#D})p!{jjj3Hyo8p0oFlH@MK
zjPqO3tbz~#!Nn-H^<>3gQSmpC9FqS45&2j8{{TvTxM?3G@j<m_DH%K<=jwSs&a`c9
zhbjmF46`1JJxxAq=!X)kTfFX7FaR5!*!-wHO;)(PRgBx)lD6RG*o-u%)cOxW+v`@Z
zuNC9_J-90XIVAOv`k&}KRz!=tipp~3*xfGVr&1vR6*O1JZtmxRD=I|@2{mHYHo#&q
z2TGMTXyUDNNJ`8A<gqvabtbeYxQ@aFjO}8C;B^v7(3Aek^!Fp{RICETlj<`PBDXsu
z!R4eQx40jcLG4+Eq`RG_kdAl?Iw&0g_7$ppvA<Si)!B77T}WBn{KT`l=L5N@u4MBZ
zU~Nz^tNqc~RE5M=qBQ%=es7@YKd<FblsiUqxcXEuA88X$I>f9L0lD3}B95f{3b}tj
zn(<r23c(o4sq4>k@7(=pD7J_!@s~e6$sXK<{5$)fYK3>R7jL}BHCrnScX8_&104r*
zO~GUGjy-C0h>@2)YGEHhI#k$WVThfodz_k>p?OFjjXp3Gk6%ijCFF5V#N1?d;Xv+b
z?GUR$Fe$7CD!ECyNzfv5>zbz~UBK)r+~l97S6I$@?^4P`qmPc9RC0`*Rjgqu76T-j
zVkt}jUwVe#4E)BdI++-HW~WHPrU0Z-PGrFLthq)VkEcr7n~()%yhM@@QfOd^f^Nk_
zG+9!8dRC8;RwCWU0+}>ncARwLlH3eHq&9sHYi7lPw@^C@&yq3bx(<pl^`}`*BZv&~
zlDztgWm`i0altWfGB9#*NZ{k}sGm=3W+uvKLDykyij8hIXKyD2fGMeam{G~~&$nt#
zs}VYPMC*Gn`7R?Hzq~l9qnRX?uyc?NRC&ORgUX(s)m+Fk^5kN(a%VeIi|$y|F19do
za4~`PteczXkr@fYDfv&YwON@Oc)$(F>6*{DP@GDbLbxZ;id@d;RT;#N+Dv?<af6dn
zt+q)+%I*wCdH{Jpk*xNWl58$ygpi?^oF9Km(zKP$z)TH}hwwk-3eVwoWmKOe<omD4
zexo2*B8w-J>B08;{{V$lkIR}8l4mEO@0yPGe9s^(c0bF{BO}t8r^76_PSPm_I486J
z0QKp`B^7bgwJKL-`xrcl+U%9xw5qn=fQ%o;oA#x+x;DV_Mh86)Z}aa?yE6TnQ3oul
zTmJyBj`*%h-f5wqd1fpdC!Vy{x|vgpto@_;9lSUMDH!cnQI}zcU-fKq5BCqR_|`Oc
z%QSFY=ZQY>{_BpvpI^qcB5>fTBC~D9gzwaBqH9Op8isUTu7{+t^~d-g+<H{8-53@*
zA(IiD`A);`=c=&u>__8Lq22^-jm!5K{{SQHQ29!Q;&6P4dV~BY*YN&TtSZ=0pK_k3
zX0jt(+eT4A+cZ7lZ|bM9tk~J3BX?ehHNS4MthfXVsD|I0kt%<5`q6WyLo{A|3AGPY
z?-}$y-iE46n^bkeX=ut2NT(U<D%=vuYcHD#>(q85(2Ati?yKvHK_jjJBryGIRIF&@
zb5?t3uHk8|hSCOJEcEvt)rtP5Z=0du`-;)DxoyFL*z3^L*S0Br8%YX<m~zpMhuwd|
zsq3oTOHCPxG&@|W<aGzuqTqsu<{0Q|(uVT3xcQ?v&$UvxQmwgjpK6DurL96)t@R9B
zUO$&O7>?<`MSoHL*HP{(BI4R>8@9K&LXJNp=t1s1>h08PaTee(%Si>uo)xpc4Dhe|
zxfEsLKHjV9zm+*PC9dVh@!Mf7&Hz1YR^5rgQb6xn%V#yL!5oHG8<EuITzZ3AF~q<J
zPNS`1%G0rKNt%}PI<O?>wG!_H2OL%`GRG?HakSOo14IC9k?TzvHFDL(ycZ*K^)-oi
zq{q0E!1k?JU87b<II9~LNb(NvtsY|7=Ha&cIsPIl%S*SAO<tPc4b$+dmlpX`fl~En
zB(;!5bGwHJ>MGnaIBcN>dxKa8;jN^@ZYx%j{{U_Z<chl;X=rbYsKNP$57MngaE(|o
zYH5%h40_d;R@fzt_bKO&)Fgz{HOK+CBMy61b3qN<O$?|odV~Ecj1z(eB*S_dnVhf8
za&u8CchwxVaLRhE8Je<O=$8Q`C9TQ%U(?vvQ)hn-^dTIKk@BBP%H18*Mk$wa5rdIg
zQbj(CD3UT{v?qi8+DX}*sY(*F(Uc~(W>rjcvaoLSZGgTjwuUhCl0`b|#yMjmyWpLT
zX}u3Tw-O}F7|$ma*F-#vTy!K;ZM1MNH<cJ(YSi!<4gkRDD4zB+t1f1d4V}@9zV3#Z
zHOi~*Q%;u2<4i9#NLor{<ndahYjm_VAiE|$rjev+4%6P28$yUF#0bbZ98i$kBU=S#
z$sE+*U>_&|3Q=~T;PFfLv5|(R;Kg*yw)Tn{c+M)uqX<|F4@z4(gGZbSTZj8Z1wRV7
zD<eod9%-vw%5E4htvg79>f&I;5V@{`eIX>cAdYig2AieD6e9{La$Jed<PlrX916+U
zQ`1(KBjwJgY2!7rn2cj@6<+GjDdRsTDX6&Wh*Cu<EaAIfF)`1jL1+!Mr5iZSGzs+i
zGP!8;(zY$u3(y<@8@*SvjP*llN!=30rIZjj^{1Vy)bm#&wAh#(DO%M}-XD!;$!06q
z&&VgeT(v;W$TemyJ~DV})3>n-1`S$^Tax6N3|5H4rA)R84;?D<LKIW57oLZ;4oFEd
zk+N993OPIz=~U#>;&aN@sW>#oHI$^8N(MjnMY<g98KrADR~&b(7Hk@?GH)0iNI3kd
zb96;G=wvVNrXc{y$t3Uq^)&_U$aMQW8S;V2Jqw>x{RKYb_i>n^IbY&$?;r52sik!U
z5JhFpt%3_t#n(G?ienB=1yar&xnf0HNkf2pQj*ZRnk$irH_8iitFgrMPaU)ix<@zz
z-kTWbC$(u<ui27oo|5@_AMEboN7VA)Ks{)u8?i=riJz%5hFdtFbQbwlBY@0)^L{7v
z0-!(<0Q~B2BQ5l#F?`snz1_>Tm4@dm39TzQOc9&L19&@O2m9UI@cwx8sBJA8dxTD&
zr1k@;tw}=2;GiNj8&Q1+`RDPgit1-B#&W~9H)G2=$3L0=l=O=ovTD;rb}l}(EKpz%
zS}X)97jyYj%5zqhFKSkgXz4|XxmGyIV^<=NVD~h{z#LTR&ot5#V0hxAZgM*gwIg@M
zCNP+xf+{k1rU<f0HFIF^%|{}f;8fUUcfx~I-!Js5(yth*{#njX0MN#Wl|qkt3+Jaw
zC19ORM9z3s7|$fqD#K(@P_f|VtFne|{+t?XyGbYsoq*tz-`<?qN7y<b`KetM%1J2!
zRYz{MTI7UWo^i<f(_QyIK{!1sDM6M7k%<eOl6b(S%nBtIT!!4=GjmWfHuR5?+~c73
zs}9k`iq1<i3=9rAsAjuWk~oTjoF2dDnwHHJM7RRk1dd5O<yWm%fQg}4W9|bEdHNsL
ztZFl!ahAr@i~>MEtz=kkwVPqaaN#}2T36^-EzzsLlNkg7(=^<L?^RlGU1S_&WQ_Ew
z6q4JRZBjU>XB9+FuJNIG_McU5L8P~kT!|u(scy<h`G>cyIxGjk9edSTVNo+HcjOV;
zr8@*|EzJA9Ijv+4OEMGkvZ!6Y#PzGT(g@{3!v*;O_Bh2$9npt$7emc~_|JS*StCJr
zZjeYGC2~O=@kG7r7g92mQkPDp1QM&3R7Tw2G8~m``qt_!cJOiXs2%ymQ?ZGbM`s`e
z9KULUd!uu3fk{}<{F&~3O;gu%N|gPpQClK8<(*{nK|kv<xZtJ*K_!z+rs6Ode;R>|
zA75(LSlp-%fzeOQzr?>o=~nGvNafud@Ou%~pEpCHMMY?1Y4AUnECdp+yOiUd%l&bh
z)3h!|(s>o4Z6jA(h(H7!vW}F=tkoZSa6PJJ8=_>Ci6WDJ6~6al>;C}Pt7Sy7^Y?{8
z6h2_d7(Ma)>dFQO=KJ5yx!P%*bEf056QN~SQ}f6Czxvf;8H%4d06nBb(EkASjSPSs
zj`YB$H^>|wzV%V5VqsUK8+l>~`$euyXR7y8=xZ_J>_!i*ZbIfKByy}UN2sP<S`H+P
z21C==>GiF-l$FlPbr!}M1qj>*M|!mj94jjj!;nX6ds~Cor@c4IXt`6Bq?ys93GKz%
zXN(1$j5-dEJxBOf45Y4Ah8P@Tw`I#HWjkDQ2<i=0x{f_Sj0H>0xyNZfp7o>mmzgWw
zentbGx!cgw5+ECJs|<x%mBu#l%_1(==kTmq&|S17qeS4nxj8C6@*nRnq5Tb8NZQec
z^S&1Qk^#dH=zXfRD&1Fx6>=N0xsFuxA%DKU{-fTSchQ-@Ab65h-3bK#wImX{0J3xa
zDjSO<3oF9^04s8iIv@VOwIo*=ZL)Gf`@)k=J%+5&qK>3s{Mf1O{!&7o4{E5YjKS29
zd;8V6Q;<Qd+a^sUOv)JNy+&<|j+v<obI8G|;Q6sp7kL~h-bPf~YBqeu{VOhAHSL>x
zV|0F%+e>V+AS82Cp2?FW46mWCo>QtbIL27rikz!et4Ur1jmK%=irKu<Bb09g2imim
z*b-tVYmS+zX`@vn(8<xk-x<j3iXgp}8|KL#^&~<_PT3bD)}JH^ox`E2a$~YAtby>#
z$DX35hIL%SCLIq2TB%1H54qG-&9*~^;r%FUnHf%;WUV6Yyd_(uSZKlRPK+O#pq^r!
z;=L$Z);#p|xZ@Ila5~cksK9Ns*zPx0qDEwkfyOFh9%(jZ8D3X1<hKr|HBoJUdC20n
zp4GMz1yG#%V<gs!HfGJ-U<j*%D<<u>u?t!|gr|d1NoH7s*A-HB98AQuc{w#Q*_G+m
zw7jsZjC86q+?b!I6)tYVvblEI+6YK)Y8bDCb27Q%jYOBK$&y7<xI*pbq4|0d=~ATb
zS}k1PM;!Uct(_+ICWrt=YlgkIo#pvKARe9S{*Ns7?g>^HJm$5h?-|ZVc>D~~q1>F-
zY<J>70X=H{w1M~oHBB@^Xs;!*$oo%2PhCD+XA(eir>LzjDM-!^eJNsWjiDS-9R{7A
zg_~2pKbwP6n2A4gBkNMyS~)v_I6aMOrK%1{=CpE8P;TaY8hSU)$Tgsmgct^%Kwbqj
zw>*mHoXSxH$0z>)*HJM!=A6x*wHux@ST<;o(%9=!vCn!!bKaQEg(uRY<=A0yAS51@
zUiHi{Iv>8sNzY;F{xp^kv`H!K#3((_{{UK{zJ2?H9F|f(yiv_%a$CAFyy7V)MV2~;
zc3aw~+45&Kj2jd+W*x*NHbLN1tcw?I!+}rKW{Eyo0blloL2L4iAue5j7_F1KG^rnT
znz69KW*Jq%BiGurUP88!D;&u=GdcYi^dGHdGXRAYj4x6<nzJ;!?g&%zqAHKT0YoKh
zqmqoTD-*;6u>-X{!}F8*Q#A9OlULzSI(1Rhwgpkt^d^okv)X4EdARy0{{R#GszgkI
zPo0CvcKy-vuh4X*jC_Rmtvrmp#Av}Ij+J6G80k^=%}*apQ(Kb9B6r6%9Fga(SqF-2
zfB~AT5>fJqH6Y+%Q;$6;E4GtBG>{HE(iWhpCyu6^Av_aOP~7Jjrn3Yd=B8x|ed;Zv
zIX!3@6E1l@Jt_%CIVYO6BZXS3=OnQoN{L9)F)2Ji9G*!%O;?jF%onFX2YR+on<hH@
z)pt=8g+Un|NUW}Ckr+W5<N=aDDu9pO=dMX4`qZ})0fb+=PtA`?h{!hWKg2RUu}6^X
zRGGwZ_AGsSdQuk>Mdi%cW&mv+MMrRakn%Twx=%s=H3X~!Jj5IxgQ4kCn=Z(!HPZd1
z)F{U8JMmdMg~OOh1mw8{dk$$WWRiQCi;O8?bM2fFO4jD&ODlcn$-wP`PxPR*WJNBj
z(g<S!Gco`P>JO%B>JRkTvJ05g1ZUGce?P*Zg3TehfT#@TEx)p2q*-H@NFcW5Oqr08
zjo;om^aB_k{<PPaJ9HR>Y=k2T9lmY&;~!7!Rug^zV1FL<OUVN9Pt3R^DIVMk(uizO
zcV{El^{H=DmaHs2axskLQqLcnpo7S#s^ojqwl;7&R&AV-c9BHk)W;w`$KI;>b6H$L
zDkveJYbo!?Ut?QP**wk)BL?^Ae!i6ss|+^LG>sva;a76`W8^?R`}M1YTh#ATQjaN_
z46|PsK@-NYovgm6*NVQD0H7^~$6;A|JaRM=GReo8jk)0Q*0n-OY(VHwJanp+u3U6l
z4vmaXayjU7dWx{E9}F@v)Ou4E7L;xs`x<h<$&s~{f#ci%0IgZ<TC*fco6J(AFwari
zr*|L+P%=0wdy0lM+S|TR-S&>vU@E#cK*(TeUEan{Nz}JtvF;dX$@+R#<#6aSpdV_k
ziGnuZA4--|(R|p+^rF`lW}Tw9Wa}7lfPS9FqhLn_Q5(lEsyC-<nTF)w+;G^e<pY|f
z8xR4-O%#E04=Q?!go20EQr0SNEXhg5*^ZXqoQB9f53NDvZ9dgzH{0@`x@pY0>-Z7g
zx+65sx^=o4w-)>oo%>c5^lW0>G94R&E2?xXK3d1WfCYb-2N}g~wmKs#nMo>YspZZ@
zcVJ*hcn|D-{p&JwJ*=%1szFnX8j|4|PU!meAW$!o@ddr(?P0`o5z6~FsHt|A`<pAT
zLgkIv0B+|se8dcUl;j?31jj9`-du`7?0XUFD^d%>suYZhh1rCcRA}3=SjsT-A#c8!
z$KpK=A%pBrtbj=i@~w`(-pAUi8B}1ddGAlOx-qjO<>6!VAwQ_}H2JNeE;lt`xwm`<
z8%<pTJ-;ce&$F&om9mY>Kd<;7YP5(^D;ZsGa#f=ktHrbuvu-@oV!4qR5CIvcIp(Lf
zi+9qZ7jHv~Dx-IcEvB?o;i5S9s}bs!(u@`=eZ^Li%rbnq$f|N$73eExIm2?@T9Mq@
z#7@~qQ}i`T_e~cADn4=0ip~*Ar{&1{RG(?NhxpIqN3~CMX&hH{Rhk*%;ei<ML^6;z
zN}!I|t7A}<j5@3RMMe@iB-~J`_NjA@>76mf(|eM_vB}D|-)gY-ir^3iM{!L^&>XI6
z!i)-!Xk%9uAF|6aTd)PNNc5_AknUlQ^if{SqLeOkrul4C7VORydz@EKYXYPoA4({w
zq40_c6PQCG=~#D`8{GDyirPx$r>RaG285DG+f%9u4il{uQbLn=8uo6@hmC;s6n|g~
zfC8{=^v@JgSIC+b+GEqf<nUuS{{UsGczhWoa)LQS)Hm{?ipi@x7k14v;M)nJRhBM5
zz&Yf8G}!zn4Y|US%kF5RlYFm1B$;7+HEaAxGCC1f;L)wovdRuR9+Xi_o>wcC7uLqy
ze+a1syi3tV6jO3xWIb@50HiWviYgqLN+@Y;ifK6*2Z|`EWmt`cLm}=@6j4yiMQI-&
zcA+vBQlsx<y%bh`N_#Sml)I6@KPY30t$%P)#|Mr@6jV0(OriWUI{-f&4<qSLGZaz@
z!J>*$9Ta8XymBe?7c~Z+!rm60TO^YlOYi(g@Em<lwG>v=x|`MbOY!7@xW^S@5tH?z
zik0ymWi2X~p!qZOtqT|Vq?7xL!S_6WQ$-cB-0P7ja4<45dJ3?I<i!+KAu+(@jC2(`
z9GWPmiDUP6rb!_u0*WfG#bFRrh&lG6icRP=kl+@MLH0CJQ+7Kcj2ND^8A5oXicOVb
zyoh`H)Jng+!Stev&F)$iZLyLE)YVy`4hX=_6jJCkQn|M(0o%qg!0S~OHd%&G1CUQ@
zD5+(klgEHS>6~Emilp|<f=d$Q^v^-<MHH^?RkSM`ADp>gmPI)Su;U-rszs-}x)mIz
zcIW%ub43(fYhr7CA>o4I)xLRxf)5~M<+%R<J!<8?lriUJLlKS#zcf)oVr-H0q*MjA
z<%Ti`e&(7iqf*SkWSryDiYSqVO^DqT9zRN!9itntXri-woD%qpg~q}F;QCcdn>gf!
z%!P|O?c3A}D6Jswc2hZx4>tEwMI&P*sXq15C`9=VHucCkZ~nCuR*!Q;D%(QNUKt5-
z*qXYNsX())03Cg(qLq<Isl)#DN66<Z^rA7AWyo>2rg;=mSlixa6t!|>k?m8tl#F*K
znA(lIzS$MWZNrg86uGRfZ9#L*x6ofR=E{*G_Bg>6rD*Eo89a`2MHO8Uw4$BH&mPvn
z%ChmE#CE1HBOa7dSk#KS$v7(x#wqCBVDzGj=T6T<o#^x#AS!YX<5ezp%7s`A;{ee`
zbyJhM(N0dsDRUD>=dsU9t1L+<Ag&Ls6jey{;O`rY9FnXaLH*({DW0qJG$iNlvXDg-
zQoCbm7ChEL>J3<qMJI6`Xrh#ajVnofx7tZ~U+*ucxc9ALl^aeONcmWI6j4zp?wq)t
zitaGgdc^QJ1R5x%OLLNeq|qy;Q@2{JBu|1BvONtHS6#F^8c%Yf+(`_ZWsP?FikoyS
zH~C^fdr?JZCqka~A~C>8A~L?&rwCN?2kS)@%Tb(_p%t+z#s_MC0HTVBE3=KC|Jir}
Bp^E?j

diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
deleted file mode 100644
index 63905c04cf7..00000000000
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import py_paddle.swig_paddle as api
-import numpy as np
-
-from paddle.v2 import data_type
-from paddle.v2.data_feeder import DataFeeder
-
-
-class DataFeederTest(unittest.TestCase):
-    def dense_reader(self, size):
-        data = np.random.random(size)
-        return data
-
-    def sparse_binary_reader(self, high, size_limit, non_empty=False):
-        num = np.random.randint(size_limit)  # num could be 0
-        while non_empty and num == 0:
-            num = np.random.randint(size_limit)
-        return np.random.randint(high, size=num).tolist()
-
-    def test_dense(self):
-        def compare(input):
-            feeder = DataFeeder([('image', data_type.dense_vector(784))],
-                                {'image': 0})
-            arg = feeder(input)
-            output = arg.getSlotValue(0).copyToNumpyMat()
-            input = np.array(input, dtype='float32')
-            self.assertAlmostEqual(input.all(), output.all())
-
-        # test numpy array
-        batch_size = 32
-        dim = 784
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.dense_reader(dim))
-            data.append(each_sample)
-        compare(data)
-
-        # each feature is a list
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.dense_reader(dim).tolist())
-            data.append(each_sample)
-        compare(data)
-
-        # test tuple
-        data = []
-        for i in xrange(batch_size):
-            each_sample = (self.dense_reader(dim).tolist(), )
-            data.append(each_sample)
-        compare(data)
-
-    def test_sparse_binary(self):
-        dim = 10000
-        batch_size = 32
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.sparse_binary_reader(dim, 50))
-            data.append(each_sample)
-        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
-                            {'input': 0})
-        arg = feeder(data)
-        output = arg.getSlotValue(0)
-        assert isinstance(output, api.Matrix)
-        for i in xrange(batch_size):
-            self.assertEqual(output.getSparseRowCols(i), data[i][0])
-
-    def test_sparse(self):
-        dim = 10000
-        batch_size = 32
-        v = []
-        w = []
-        data = []
-        for dat in xrange(batch_size):
-            each_sample = []
-            a = self.sparse_binary_reader(dim, 40, non_empty=True)
-            b = self.dense_reader(len(a)).tolist()
-            v.append(a)
-            w.append(np.array(b, dtype="float32"))
-            each_sample.append(zip(a, b))
-            data.append(each_sample)
-
-        feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
-                            {'input': 0})
-        arg = feeder(data)
-        output = arg.getSlotValue(0)
-        assert isinstance(output, api.Matrix)
-        for i in xrange(batch_size):
-            self.assertEqual(output.getSparseRowCols(i), v[i])
-            cols_value = output.getSparseRowColsVal(i)
-            value = [val[1] for val in cols_value]
-            value = np.array(value, dtype="float32")
-            self.assertAlmostEqual(value.all(), w[i].all())
-
-    def test_integer(self):
-        value_range = 100
-        batch_size = 32
-        index = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(np.random.randint(value_range))
-            index.append(each_sample)
-        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
-                            {'input': 0})
-        arg = feeder(index)
-        output = arg.getSlotIds(0).copyToNumpyArray()
-        index = np.array(index, dtype='int')
-        self.assertEqual(output.all(), index.flatten().all())
-
-    def test_integer_sequence(self):
-        value_range = 10000
-        batch_size = 32
-        start = [0]
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(
-                self.sparse_binary_reader(
-                    value_range, 30, non_empty=True))
-            data.append(each_sample)
-            start.append(len(each_sample[0]) + start[-1])
-        feeder = DataFeeder(
-            [('input', data_type.integer_value_sequence(value_range))],
-            {'input': 0})
-        arg = feeder(data)
-        output_data = arg.getSlotIds(0).copyToNumpyArray()
-        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
-
-        index = []
-        for dat in data:
-            index.extend(x for x in dat[0])  # only one feature, so dat[0]
-        index = np.array(index, dtype='int')
-        start = np.array(start, dtype='int')
-        self.assertEqual(output_data.all(), index.all())
-        self.assertEqual(output_start.all(), start.all())
-
-    def test_multiple_features(self):
-        batch_size = 2
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(np.random.randint(10))
-            each_sample.append(
-                self.sparse_binary_reader(
-                    20000, 40, non_empty=True))
-            each_sample.append(self.dense_reader(100))
-            data.append(each_sample)
-
-        # test multiple features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
-        arg = feeder(data)
-        output_dense = arg.getSlotValue(0).copyToNumpyMat()
-        output_sparse = arg.getSlotValue(1)
-        output_index = arg.getSlotIds(2).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(output_dense[i].all(), data[i][2].all())
-            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
-            self.assertEqual(output_index[i], data[i][0])
-
-        # reader returns 3 features, but only use 2 features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
-        arg = feeder(data)
-        output_dense = arg.getSlotValue(0).copyToNumpyMat()
-        output_index = arg.getSlotIds(1).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(output_dense[i].all(), data[i][2].all())
-            self.assertEqual(output_index[i], data[i][0])
-
-        # reader returns 3 featreus, one is duplicate data
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10)),
-                      ('fea3', data_type.dense_vector(100))]
-        feeder = DataFeeder(data_types,
-                            {'fea0': 2,
-                             'fea1': 1,
-                             'fea2': 0,
-                             'fea3': 2})
-        arg = feeder(data)
-        fea0 = arg.getSlotValue(0).copyToNumpyMat()
-        fea1 = arg.getSlotValue(1)
-        fea2 = arg.getSlotIds(2).copyToNumpyArray()
-        fea3 = arg.getSlotValue(3).copyToNumpyMat()
-        for i in xrange(batch_size):
-            self.assertEqual(fea0[i].all(), data[i][2].all())
-            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
-            self.assertEqual(fea2[i], data[i][0])
-            self.assertEqual(fea3[i].all(), data[i][2].all())
-
-    def test_multiple_features_tuple(self):
-        batch_size = 2
-        data = []
-        for i in xrange(batch_size):
-            a = np.random.randint(10)
-            b = self.sparse_binary_reader(20000, 40, non_empty=True)
-            c = self.dense_reader(100)
-            each_sample = (a, b, c)
-            data.append(each_sample)
-
-        # test multiple features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
-        arg = feeder(data)
-        out_dense = arg.getSlotValue(0).copyToNumpyMat()
-        out_sparse = arg.getSlotValue(1)
-        out_index = arg.getSlotIds(2).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(out_dense[i].all(), data[i][2].all())
-            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
-            self.assertEqual(out_index[i], data[i][0])
-
-    def test_dense_set_shape(self):
-        # test 2-D data
-        def gen_data(batch_size, shape):
-            data = []
-            for i in xrange(batch_size):
-                each_sample = []
-                each_sample.append(np.random.random(shape))
-                data.append(each_sample)
-            return data
-
-        feeder = DataFeeder([('image', data_type.dense_array(2352))],
-                            {'image': 0})
-        arg = feeder(gen_data(32, (3, 28, 28)))
-        h = arg.getSlotFrameHeight(0)
-        w = arg.getSlotFrameWidth(0)
-        self.assertEqual(h, 28)
-        self.assertEqual(w, 28)
-
-        arg = feeder(gen_data(32, (3, 30, 32)))
-        h = arg.getSlotFrameHeight(0)
-        w = arg.getSlotFrameWidth(0)
-        self.assertEqual(h, 30)
-        self.assertEqual(w, 32)
-
-
-if __name__ == '__main__':
-    api.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(DataFeederTest)
-    unittest.TextTestRunner().run(suite)
-    if api.isGpuVersion():
-        api.setUseGpu(True)
-        unittest.main()
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
deleted file mode 100644
index c78bbdc40a2..00000000000
--- a/python/paddle/v2/tests/test_image.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-
-import paddle.v2.image as image
-
-
-class Image(unittest.TestCase):
-    def test_resize_flip_chw(self):
-        # resize
-        im = image.load_image('cat.jpg')
-        im = image.resize_short(im, 256)
-        self.assertEqual(256, min(im.shape[:2]))
-        self.assertEqual(3, im.shape[2])
-
-        # flip
-        im = image.left_right_flip(im)
-        im2 = np.flip(im, 1)
-        self.assertEqual(im.all(), im2.all())
-
-        # to_chw
-        h, w, c = im.shape
-        im = image.to_chw(im)
-        self.assertEqual(c, im.shape[0])
-        self.assertEqual(h, im.shape[1])
-        self.assertEqual(w, im.shape[2])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
deleted file mode 100644
index b169a0f38ee..00000000000
--- a/python/paddle/v2/tests/test_layer.py
+++ /dev/null
@@ -1,290 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.v2.activation as activation
-import paddle.v2.attr as attr
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-import paddle.v2.pooling as pooling
-import paddle.v2.networks as networks
-import paddle.v2.evaluator as evaluator
-
-pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
-label = layer.data(name='label', type=data_type.integer_value(10))
-weight = layer.data(name='weight', type=data_type.dense_vector(1))
-combine_weight = layer.data(
-    name='weight_combine', type=data_type.dense_vector(10))
-score = layer.data(name='score', type=data_type.dense_vector(1))
-
-hidden = layer.fc(input=pixel,
-                  size=100,
-                  act=activation.Sigmoid(),
-                  param_attr=attr.Param(name='hidden'))
-inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
-conv = layer.img_conv(
-    input=pixel,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    act=activation.Linear())
-
-
-class ImageLayerTest(unittest.TestCase):
-    def test_conv_layer(self):
-        conv_shift = layer.conv_shift(a=pixel, b=score)
-        print layer.parse_network(conv, conv_shift)
-
-    def test_pooling_layer(self):
-        maxpool = layer.img_pool(
-            input=conv,
-            pool_size=2,
-            num_channels=16,
-            padding=1,
-            pool_type=pooling.Max())
-        spp = layer.spp(input=conv,
-                        pyramid_height=2,
-                        num_channels=16,
-                        pool_type=pooling.Max())
-        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
-        print layer.parse_network([maxpool, spp, maxout])
-
-    def test_norm_layer(self):
-        norm1 = layer.img_cmrnorm(input=conv, size=5)
-        norm2 = layer.batch_norm(input=conv)
-        norm3 = layer.sum_to_one_norm(input=conv)
-        print layer.parse_network([norm1, norm2, norm3])
-
-
-class AggregateLayerTest(unittest.TestCase):
-    def test_aggregate_layer(self):
-        pool = layer.pooling(
-            input=pixel,
-            pooling_type=pooling.Avg(),
-            agg_level=layer.AggregateLevel.TO_SEQUENCE)
-        last_seq = layer.last_seq(input=pixel)
-        first_seq = layer.first_seq(input=pixel)
-        concat = layer.concat(input=[last_seq, first_seq])
-        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
-        print layer.parse_network(
-            [pool, last_seq, first_seq, concat, seq_concat])
-
-
-class MathLayerTest(unittest.TestCase):
-    def test_math_layer(self):
-        addto = layer.addto(input=[pixel, pixel])
-        linear_comb = layer.linear_comb(
-            weights=combine_weight, vectors=hidden, size=10)
-        interpolation = layer.interpolation(
-            input=[hidden, hidden], weight=score)
-        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
-        power = layer.power(input=pixel, weight=score)
-        scaling = layer.scaling(input=pixel, weight=score)
-        slope = layer.slope_intercept(input=pixel)
-        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
-        cos_sim = layer.cos_sim(a=pixel, b=pixel)
-        trans = layer.trans(input=tensor)
-        print layer.parse_network([
-            addto, linear_comb, interpolation, power, scaling, slope, tensor,
-            cos_sim, trans
-        ])
-
-
-class ReshapeLayerTest(unittest.TestCase):
-    def test_reshape_layer(self):
-        block_expand = layer.block_expand(
-            input=conv, num_channels=4, stride_x=1, block_x=1)
-        expand = layer.expand(
-            input=weight,
-            expand_as=pixel,
-            expand_level=layer.ExpandLevel.FROM_NO_SEQUENCE)
-        repeat = layer.repeat(input=pixel, num_repeats=4)
-        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
-        rotate = layer.rotate(input=pixel, height=16, width=49)
-        print layer.parse_network(
-            [block_expand, expand, repeat, reshape, rotate])
-
-
-class RecurrentLayerTest(unittest.TestCase):
-    def test_recurrent_layer(self):
-        word = layer.data(name='word', type=data_type.integer_value(12))
-        recurrent = layer.recurrent(input=word)
-        lstm = layer.lstmemory(input=word)
-        gru = layer.grumemory(input=word)
-        print layer.parse_network([recurrent, lstm, gru])
-
-
-class CostLayerTest(unittest.TestCase):
-    def test_cost_layer(self):
-        cost1 = layer.classification_cost(input=inference, label=label)
-        cost2 = layer.classification_cost(
-            input=inference, label=label, weight=weight)
-        cost3 = layer.cross_entropy_cost(input=inference, label=label)
-        cost4 = layer.cross_entropy_with_selfnorm_cost(
-            input=inference, label=label)
-        cost5 = layer.square_error_cost(input=inference, label=label)
-        cost6 = layer.square_error_cost(
-            input=inference, label=label, weight=weight)
-        cost7 = layer.multi_binary_label_cross_entropy_cost(
-            input=inference, label=label)
-        cost8 = layer.rank_cost(left=score, right=score, label=score)
-        cost9 = layer.lambda_cost(input=inference, score=score)
-        cost10 = layer.sum_cost(input=inference)
-        cost11 = layer.huber_regression_cost(input=score, label=label)
-        cost12 = layer.huber_classification_cost(input=score, label=label)
-
-        print layer.parse_network([cost1, cost2])
-        print layer.parse_network([cost3, cost4])
-        print layer.parse_network([cost5, cost6])
-        print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
-
-        crf = layer.crf(input=inference, label=label)
-        crf_decoding = layer.crf_decoding(input=inference, size=3)
-        ctc = layer.ctc(input=inference, label=label)
-        warp_ctc = layer.warp_ctc(input=pixel, label=label)
-        nce = layer.nce(input=inference, label=label, num_classes=3)
-        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
-
-        print layer.parse_network(
-            [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
-
-
-class OtherLayerTest(unittest.TestCase):
-    def test_sampling_layer(self):
-        maxid = layer.max_id(input=inference)
-        sampling_id = layer.sampling_id(input=inference)
-        eos = layer.eos(input=maxid, eos_id=5)
-        layer.printer(maxid)
-        print layer.parse_network([maxid, sampling_id, eos])
-
-    def test_slicing_joining_layer(self):
-        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
-        print layer.parse_network(pad)
-
-
-class ProjOpTest(unittest.TestCase):
-    def test_projection(self):
-        input = layer.data(name='data2', type=data_type.dense_vector(784))
-        word = layer.data(
-            name='word2', type=data_type.integer_value_sequence(10000))
-        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
-        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
-        mixed0 = layer.mixed(
-            size=256,
-            input=[
-                layer.full_matrix_projection(input=fc0),
-                layer.full_matrix_projection(input=fc1)
-            ])
-        with layer.mixed(size=200) as mixed1:
-            mixed1 += layer.full_matrix_projection(input=fc0)
-            mixed1 += layer.identity_projection(input=fc1)
-
-        table = layer.table_projection(input=word)
-        emb0 = layer.mixed(size=512, input=table)
-        with layer.mixed(size=512) as emb1:
-            emb1 += table
-
-        scale = layer.scaling_projection(input=fc0)
-        scale0 = layer.mixed(size=100, input=scale)
-        with layer.mixed(size=100) as scale1:
-            scale1 += scale
-
-        dotmul = layer.dotmul_projection(input=fc0)
-        dotmul0 = layer.mixed(size=100, input=dotmul)
-        with layer.mixed(size=100) as dotmul1:
-            dotmul1 += dotmul
-
-        context = layer.context_projection(input=fc0, context_len=5)
-        context0 = layer.mixed(size=500, input=context)
-        with layer.mixed(size=500) as context1:
-            context1 += context
-
-        conv = layer.conv_projection(
-            input=input,
-            filter_size=1,
-            num_channels=1,
-            num_filters=128,
-            stride=1,
-            padding=0)
-        conv0 = layer.mixed(input=conv, bias_attr=True)
-        with layer.mixed(bias_attr=True) as conv1:
-            conv1 += conv
-
-        print layer.parse_network(mixed0)
-        print layer.parse_network(mixed1)
-        print layer.parse_network(emb0)
-        print layer.parse_network(emb1)
-        print layer.parse_network(scale0)
-        print layer.parse_network(scale1)
-        print layer.parse_network(dotmul0)
-        print layer.parse_network(dotmul1)
-        print layer.parse_network(conv0)
-        print layer.parse_network(conv1)
-
-    def test_operator(self):
-        ipt0 = layer.data(name='data1', type=data_type.dense_vector(784))
-        ipt1 = layer.data(name='word1', type=data_type.dense_vector(128))
-        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
-        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
-
-        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
-        dotmul0 = layer.mixed(input=dotmul_op)
-        with layer.mixed() as dotmul1:
-            dotmul1 += dotmul_op
-
-        conv = layer.conv_operator(
-            img=ipt0,
-            filter=ipt1,
-            filter_size=1,
-            num_channels=1,
-            num_filters=128,
-            stride=1,
-            padding=0)
-        conv0 = layer.mixed(input=conv)
-        with layer.mixed() as conv1:
-            conv1 += conv
-
-        print layer.parse_network(dotmul0)
-        print layer.parse_network(dotmul1)
-        print layer.parse_network(conv0)
-        print layer.parse_network(conv1)
-
-
-class NetworkTests(unittest.TestCase):
-    def test_vgg(self):
-        img = layer.data(name='pixel1', type=data_type.dense_vector(784))
-        vgg_out = networks.small_vgg(
-            input_image=img, num_channels=1, num_classes=2)
-        print layer.parse_network(vgg_out)
-
-
-class EvaluatorTest(unittest.TestCase):
-    def test_evaluator(self):
-        img = layer.data(name='pixel2', type=data_type.dense_vector(784))
-        output = layer.fc(input=img,
-                          size=10,
-                          act=activation.Softmax(),
-                          name='fc_here')
-        lbl = layer.data(name='label2', type=data_type.integer_value(10))
-        cost = layer.cross_entropy_cost(input=output, label=lbl)
-
-        evaluator.classification_error(input=output, label=lbl)
-        print layer.parse_network(cost)
-        print layer.parse_network(output)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_op.py b/python/paddle/v2/tests/test_op.py
deleted file mode 100644
index 15d5aef5111..00000000000
--- a/python/paddle/v2/tests/test_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-import paddle.v2.op as op
-
-
-class OpTest(unittest.TestCase):
-    def test_op(self):
-        x = layer.data(name='data', type=data_type.dense_vector(128))
-        x = op.exp(x)
-        x = op.sqrt(x)
-        x = op.reciprocal(x)
-        x = op.log(x)
-        x = op.abs(x)
-        x = op.sigmoid(x)
-        x = op.tanh(x)
-        x = op.square(x)
-        x = op.relu(x)
-        y = 1 + x
-        y = y + 1
-        y = x + y
-        y = y - x
-        y = y - 2
-        y = 2 - y
-        y = 2 * y
-        y = y * 3
-        z = layer.data(name='data_2', type=data_type.dense_vector(1))
-        y = y * z
-        y = z * y
-        y = y + z
-        y = z + y
-        print layer.parse_network(y)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
deleted file mode 100644
index 264442be182..00000000000
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright PaddlePaddle contributors. All Rights Reservedd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import math
-import paddle.v2 as paddle
-
-
-def wordemb(inlayer):
-    wordemb = paddle.layer.table_projection(
-        input=inlayer,
-        size=5,
-        param_attr=paddle.attr.Param(
-            name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
-    return wordemb
-
-
-def train():
-    word_dict = paddle.dataset.imikolov.build_dict()
-    dict_size = len(word_dict)
-    # Every layer takes integer value of range [0, dict_size)
-    firstword = paddle.layer.data(
-        name="firstw", type=paddle.data_type.integer_value(dict_size))
-    secondword = paddle.layer.data(
-        name="secondw", type=paddle.data_type.integer_value(dict_size))
-    thirdword = paddle.layer.data(
-        name="thirdw", type=paddle.data_type.integer_value(dict_size))
-    fourthword = paddle.layer.data(
-        name="fourthw", type=paddle.data_type.integer_value(dict_size))
-    nextword = paddle.layer.data(
-        name="fifthw", type=paddle.data_type.integer_value(dict_size))
-
-    Efirst = wordemb(firstword)
-    Esecond = wordemb(secondword)
-    Ethird = wordemb(thirdword)
-    Efourth = wordemb(fourthword)
-
-    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
-    hidden1 = paddle.layer.fc(name="fc1",
-                              input=contextemb,
-                              size=128,
-                              act=paddle.activation.Sigmoid(),
-                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
-                              bias_attr=paddle.attr.Param(learning_rate=2),
-                              param_attr=paddle.attr.Param(
-                                  initial_std=1. / math.sqrt(5 * 8),
-                                  learning_rate=1,
-                                  l2_rate=6e-4))
-    predictword = paddle.layer.fc(input=hidden1,
-                                  size=dict_size,
-                                  bias_attr=paddle.attr.Param(learning_rate=2),
-                                  act=paddle.activation.Softmax())
-
-    return paddle.layer.classification_cost(input=predictword, label=nextword)
-
-
-class TestParamConfOrder(unittest.TestCase):
-    def test_param_conf_order(self):
-        paddle.init()
-        cost = train()
-        parameters = paddle.parameters.create(cost)
-        adagrad = paddle.optimizer.AdaGrad(
-            learning_rate=3e-3,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-
-        trainer = paddle.trainer.SGD(cost, parameters, adagrad)
-        for p in trainer.get_topology_proto().parameters:
-            if p.name == "_fc1.w0":
-                self.assertEqual(p.decay_rate, 6e-4)
-            else:
-                self.assertEqual(p.decay_rate, 8e-4)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
deleted file mode 100644
index 3bfd9348a61..00000000000
--- a/python/paddle/v2/tests/test_parameters.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import sys
-
-try:
-    import py_paddle
-
-    del py_paddle
-except ImportError:
-    print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
-                         "unittest will not be run."
-    sys.exit(0)
-
-import paddle.v2.parameters as parameters
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-from paddle.v2.attr import ParamAttr
-from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-import random
-import cStringIO
-import numpy
-
-
-def __rand_param_config__(name, psize=None):
-    conf = ParameterConfig()
-    conf.name = name
-    size = 1
-    if psize is None:
-        for i in xrange(2):
-            dim = random.randint(1, 1000)
-            conf.dims.append(dim)
-            size *= dim
-    else:
-        size = psize
-    conf.size = size
-    assert conf.IsInitialized()
-    return conf
-
-
-class TestParameters(unittest.TestCase):
-    def test_serialization(self):
-        params = parameters.Parameters()
-        params.__append_config__(__rand_param_config__("param_0"))
-        params.__append_config__(__rand_param_config__("param_1"))
-
-        for name in params.names():
-            param = params.get(name)
-            param[:] = numpy.random.uniform(
-                -1.0, 1.0, size=params.get_shape(name))
-            params.set(name, param)
-
-        tmp_file = cStringIO.StringIO()
-        params.to_tar(tmp_file)
-        tmp_file.seek(0)
-        params_dup = parameters.Parameters.from_tar(tmp_file)
-
-        self.assertEqual(params_dup.names(), params.names())
-
-        for name in params.names():
-            self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
-            p0 = params.get(name)
-            p1 = params_dup.get(name)
-            self.assertTrue(numpy.isclose(p0, p1).all())
-
-    def test_initializer(self):
-        def initializer(name):
-            assert name == "fc.w"
-            mat = numpy.ones((3, 2), dtype=numpy.float32)
-            mat[1, 1] = 2
-            return mat
-
-        x = layer.data(name="x", type=data_type.dense_vector(3))
-        y = layer.fc(x,
-                     size=2,
-                     bias_attr=False,
-                     param_attr=ParamAttr(
-                         name="fc.w", initializer=initializer))
-        params = parameters.create(y)
-        val = params["fc.w"]
-        assert val.shape == (3, 2)
-        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
-        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
-
-    def test_init_from_tar(self):
-        def get_param(names, size):
-            p = parameters.Parameters()
-            for k, v in zip(names, size):
-                p.__append_config__(__rand_param_config__(k, v))
-            for name in p.names():
-                param = p.get(name)
-                param[:] = numpy.random.uniform(
-                    -1.0, 1.0, size=p.get_shape(name))
-                p.set(name, param)
-            return p
-
-        def get_parames():
-            name1 = ['param_0', 'param_1']
-            size1 = [128, 256]
-            p1 = get_param(name1, size1)
-            file1 = cStringIO.StringIO()
-            p1.to_tar(file1)
-            file1.seek(0)
-
-            name2 = ['param_0', 'param_1', 'param_2']
-            size2 = [128, 256, 288]
-            p2 = get_param(name2, size2)
-            file2 = cStringIO.StringIO()
-            p2.to_tar(file2)
-            file2.seek(0)
-            return p1, file1, p2, file2
-
-        p1, file1, p2, file2 = get_parames()
-        p2.init_from_tar(file1)
-        for name in p1.names():
-            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
-            v1 = p1.get(name)
-            v2 = p2.get(name)
-            self.assertTrue(numpy.isclose(v1, v2).all())
-
-        p1, file1, p2, file2 = get_parames()
-        p1.init_from_tar(file2)
-        for name in p1.names():
-            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
-            v1 = p1.get(name)
-            v2 = p2.get(name)
-            self.assertTrue(numpy.isclose(v1, v2).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
deleted file mode 100644
index 6ad07167dce..00000000000
--- a/python/paddle/v2/tests/test_rnn_layer.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import difflib
-import unittest
-
-import paddle.trainer_config_helpers as conf_helps
-import paddle.v2.activation as activation
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-from paddle.trainer_config_helpers.config_parser_utils import \
-    parse_network_config as parse_network
-from paddle.trainer_config_helpers.config_parser_utils import \
-    reset_parser
-
-
-class RNNTest(unittest.TestCase):
-    def test_simple_rnn(self):
-        dict_dim = 10
-        word_dim = 8
-        hidden_dim = 8
-
-        def parse_old_rnn():
-            reset_parser()
-
-            def step(y):
-                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
-                out = conf_helps.fc_layer(
-                    input=[y, mem],
-                    size=hidden_dim,
-                    act=activation.Tanh(),
-                    bias_attr=True,
-                    name="rnn_state")
-                return out
-
-            def test():
-                data = conf_helps.data_layer(name="word", size=dict_dim)
-                embd = conf_helps.embedding_layer(input=data, size=word_dim)
-                conf_helps.recurrent_group(
-                    name="rnn", step=step, input=embd, reverse=True)
-
-            return str(parse_network(test))
-
-        def parse_new_rnn():
-            reset_parser()
-
-            def new_step(y):
-                mem = layer.memory(name="rnn_state", size=hidden_dim)
-                out = layer.fc(input=[y, mem],
-                               size=hidden_dim,
-                               act=activation.Tanh(),
-                               bias_attr=True,
-                               name="rnn_state")
-                return out
-
-            data = layer.data(
-                name="word", type=data_type.integer_value(dict_dim))
-            embd = layer.embedding(input=data, size=word_dim)
-            rnn_layer = layer.recurrent_group(
-                name="rnn", step=new_step, input=embd, reverse=True)
-            return str(layer.parse_network(rnn_layer))
-
-        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
-                                    parse_new_rnn().splitlines(1))
-        print ''.join(diff)
-
-    def test_sequence_rnn_multi_input(self):
-        dict_dim = 10
-        word_dim = 8
-        hidden_dim = 8
-        label_dim = 3
-
-        def parse_old_rnn():
-            reset_parser()
-
-            def test():
-                data = conf_helps.data_layer(name="word", size=dict_dim)
-                label = conf_helps.data_layer(name="label", size=label_dim)
-                emb = conf_helps.embedding_layer(input=data, size=word_dim)
-                boot_layer = conf_helps.data_layer(name="boot", size=10)
-                boot_layer = conf_helps.fc_layer(
-                    name='boot_fc', input=boot_layer, size=10)
-
-                def step(y, wid):
-                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
-                    mem = conf_helps.memory(
-                        name="rnn_state",
-                        size=hidden_dim,
-                        boot_layer=boot_layer)
-                    out = conf_helps.fc_layer(
-                        input=[y, z, mem],
-                        size=hidden_dim,
-                        act=conf_helps.TanhActivation(),
-                        bias_attr=True,
-                        name="rnn_state")
-                    return out
-
-                out = conf_helps.recurrent_group(
-                    name="rnn", step=step, input=[emb, data])
-
-                rep = conf_helps.last_seq(input=out)
-                prob = conf_helps.fc_layer(
-                    size=label_dim,
-                    input=rep,
-                    act=conf_helps.SoftmaxActivation(),
-                    bias_attr=True)
-
-                conf_helps.outputs(
-                    conf_helps.classification_cost(
-                        input=prob, label=label))
-
-            return str(parse_network(test))
-
-        def parse_new_rnn():
-            reset_parser()
-            data = layer.data(
-                name="word", type=data_type.dense_vector(dict_dim))
-            label = layer.data(
-                name="label", type=data_type.dense_vector(label_dim))
-            emb = layer.embedding(input=data, size=word_dim)
-            boot_layer = layer.data(
-                name="boot", type=data_type.dense_vector(10))
-            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)
-
-            def step(y, wid):
-                z = layer.embedding(input=wid, size=word_dim)
-                mem = layer.memory(
-                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
-                out = layer.fc(input=[y, z, mem],
-                               size=hidden_dim,
-                               act=activation.Tanh(),
-                               bias_attr=True,
-                               name="rnn_state")
-                return out
-
-            out = layer.recurrent_group(
-                name="rnn", step=step, input=[emb, data])
-
-            rep = layer.last_seq(input=out)
-            prob = layer.fc(size=label_dim,
-                            input=rep,
-                            act=activation.Softmax(),
-                            bias_attr=True)
-
-            cost = layer.classification_cost(input=prob, label=label)
-
-            return str(layer.parse_network(cost))
-
-        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
-                                    parse_new_rnn().splitlines(1))
-        print ''.join(diff)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
deleted file mode 100644
index bacd28ddb7b..00000000000
--- a/python/paddle/v2/tests/test_topology.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.v2.layer as layer
-import paddle.v2.topology as topology
-import paddle.v2.data_type as data_type
-import paddle.trainer_config_helpers as conf_helps
-import paddle.trainer.PyDataProvider2 as pydp2
-
-
-class TestTopology(unittest.TestCase):
-    def test_data_type(self):
-        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
-        label = layer.data(name='label', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        cost = layer.classification_cost(input=inference, label=label)
-        topo = topology.Topology(cost)
-        data_types = topo.data_type()
-        self.assertEqual(len(data_types), 2)
-        pixel_data_type = filter(lambda type: type[0] == "pixel", data_types)
-        self.assertEqual(len(pixel_data_type), 1)
-        pixel_data_type = pixel_data_type[0]
-        self.assertEqual(pixel_data_type[1].type, pydp2.DataType.Dense)
-        self.assertEqual(pixel_data_type[1].dim, 784)
-
-        label_data_type = filter(lambda type: type[0] == "label", data_types)
-        self.assertEqual(len(label_data_type), 1)
-        label_data_type = label_data_type[0]
-        self.assertEqual(label_data_type[1].type, pydp2.DataType.Index)
-        self.assertEqual(label_data_type[1].dim, 10)
-
-    def test_get_layer(self):
-        pixel = layer.data(name='pixel2', type=data_type.dense_vector(784))
-        label = layer.data(name='label2', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        cost = layer.classification_cost(input=inference, label=label)
-        topo = topology.Topology(cost)
-        pixel_layer = topo.get_layer("pixel2")
-        label_layer = topo.get_layer("label2")
-        self.assertEqual(pixel_layer, pixel)
-        self.assertEqual(label_layer, label)
-
-    def test_parse(self):
-        pixel = layer.data(name='pixel3', type=data_type.dense_vector(784))
-        label = layer.data(name='label3', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        maxid = layer.max_id(input=inference)
-        cost1 = layer.classification_cost(input=inference, label=label)
-        cost2 = layer.cross_entropy_cost(input=inference, label=label)
-
-        topology.Topology(cost2).proto()
-        topology.Topology([cost1]).proto()
-        topology.Topology([cost1, cost2]).proto()
-        topology.Topology([inference, maxid]).proto()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
deleted file mode 100644
index 923ccecb0bf..00000000000
--- a/python/paddle/v2/topology.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-
-from paddle.proto.ModelConfig_pb2 import ModelConfig
-import paddle.trainer_config_helpers as conf_helps
-import layer as v2_layer
-import config_base
-import cPickle
-from paddle.trainer import config_parser as cp
-
-__all__ = ['Topology']
-
-
-class Topology(object):
-    """
-    Topology is used to store the information about all layers
-    and network configs.
-    """
-
-    def __init__(self, layers, extra_layers=None):
-        def __check__(layers):
-            if not isinstance(layers, collections.Sequence):
-                layers = [layers]
-            for layer in layers:
-                __check_layer_type__(layer)
-            return layers
-
-        layers = __check__(layers)
-        self.layers = layers
-        if extra_layers is not None:
-            extra_layers = __check__(extra_layers)
-
-        self.__model_config__ = v2_layer.parse_network(
-            layers, extra_layers=extra_layers)
-
-        if extra_layers is not None:
-            self.layers.extend(extra_layers)
-
-        assert isinstance(self.__model_config__, ModelConfig)
-
-    def update_from_default(self):
-        # HACK(typhoonzero): update ParameterConfig(proto) in case of
-        # optimizers are defined after layers, or between layers.
-        # Must be called from trainer.__init__()
-        for parameter in self.__model_config__.parameters:
-            if parameter.momentum == 0.0 and cp.g_default_momentum:
-                parameter.momentum = cp.g_default_momentum
-            if parameter.decay_rate == 0.0 and cp.g_default_decay_rate:
-                parameter.decay_rate = cp.g_default_decay_rate
-            if parameter.initial_mean == 0.0:
-                parameter.initial_mean = cp.g_default_initial_mean
-            if parameter.initial_std == 0.01:
-                parameter.initial_std = cp.g_default_initial_std
-            if parameter.initial_strategy == 0:
-                parameter.initial_strategy = cp.g_default_initial_strategy
-            if parameter.initial_smart == False:
-                parameter.initial_smart = cp.g_default_initial_smart
-            if parameter.num_batches_regularization == 1 and \
-                cp.g_default_num_batches_regularization:
-                parameter.num_batches_regularization = \
-                    cp.g_default_num_batches_regularization
-            if parameter.gradient_clipping_threshold == 0.0 and \
-                cp.g_default_gradient_clipping_threshold:
-                parameter.gradient_clipping_threshold = \
-                    cp.g_default_gradient_clipping_threshold
-            if parameter.device == -1 and cp.g_default_device:
-                parameter.device = cp.g_default_device
-            # FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func
-
-    def use_sparse_updater(self):
-        """
-        check if any parameter require to use sparse_update
-        :return:
-        """
-        use_sparse = False
-        for parameter in self.__model_config__.parameters:
-            if parameter.sparse_update or parameter.sparse_remote_update:
-                use_sparse = True
-                break
-        return use_sparse
-
-    def proto(self):
-        return self.__model_config__
-
-    def get_layer(self, name):
-        """
-        get v2.Layer Class instance by layer name
-        :param name:
-        :return:
-        """
-        return v2_layer.get_layer(name)
-
-    def data_layers(self):
-        """
-        get all data layer
-        :return:
-        """
-        data_layers = {}
-        for layer in self.proto().layers:
-            l = v2_layer.get_layer(layer.name)
-            if l and l.layer_type == conf_helps.LayerType.DATA:
-                data_layers[layer.name] = l
-        return data_layers
-
-    def data_type(self):
-        """
-        get data_type from proto, such as:
-        [('image', dense_vector(768)), ('label', integer_value(10))]
-        """
-        data_layers = self.data_layers()
-
-        return [(nm, data_layers[nm].data_type)
-                for nm in self.proto().input_layer_names]
-
-    def get_layer_proto(self, name):
-        for layer in self.__model_config__.layers:
-            if layer.name == name:
-                return layer
-        return None
-
-    def serialize_for_inference(self, stream):
-        protobin = self.proto().SerializeToString()
-        data_type = self.data_type()
-        cPickle.dump({
-            'protobin': protobin,
-            'data_type': data_type
-        }, stream, cPickle.HIGHEST_PROTOCOL)
-
-
-def __check_layer_type__(layer):
-    if not isinstance(layer, config_base.Layer):
-        raise ValueError('layer should have type paddle.v2.config_base.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
deleted file mode 100644
index 5d98d5b6db5..00000000000
--- a/python/paddle/v2/trainer.py
+++ /dev/null
@@ -1,258 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Module Trainer
-"""
-import collections
-from topology import Topology
-from . import event as v2_event
-from . import optimizer as v2_optimizer
-from . import parameters as v2_parameters
-
-__all__ = ['SGD']
-
-
-def default_event_handler(event):
-    """
-    Default event handler. It will print some log and save mode.
-
-    TODO(yuyang18): Complete it!
-    :param event:
-    :return:
-    """
-    pass
-
-
-class SGD(object):
-    """
-    Simple SGD Trainer.
-    SGD Trainer combines data reader, network topolopy and update_equation together
-    to train/test a neural network.
-
-    :param cost: Target cost that neural network should be optimized.
-    :type cost: paddle.v2.config_base.Layer
-    :param parameters: The parameters dictionary.
-    :type parameters: paddle.v2.parameters.Parameters
-    :param update_equation: The optimizer object.
-    :type update_equation: paddle.v2.optimizer.Optimizer
-    :param extra_layers: Some layers in the neural network graph are not
-                         in the path of cost layer.
-    :type extra_layers: paddle.v2.config_base.Layer
-    :param is_local: Whether trainning locally
-    :type is_local: bool
-    :param pserver_spec: comma string for pserver location,
-                         eg:127.10.0.10:3000,127.10.0.11:3000,
-                         and this parameter is only used for fault
-                         tolerant mode cluster training.
-    :type pserver_spec: string
-    :param use_etcd: Whether using etcd pserver.
-    :param use_etcd: bool
-    """
-
-    def __init__(self,
-                 cost,
-                 parameters,
-                 update_equation,
-                 extra_layers=None,
-                 is_local=True,
-                 pserver_spec=None,
-                 use_etcd=True):
-
-        if not isinstance(parameters, v2_parameters.Parameters):
-            raise TypeError('parameters should be parameters')
-
-        if not isinstance(update_equation, v2_optimizer.Optimizer):
-            raise TypeError("update equation parameter must be "
-                            "paddle.v2.optimizer.Optimizer")
-        import py_paddle.swig_paddle as api
-        topology = Topology(cost, extra_layers=extra_layers)
-        # HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers
-        # are defined after layers, or between layers.
-        topology.update_from_default()
-        parameters.update_param_conf(topology.proto())
-
-        self.__optimizer__ = update_equation
-        self.__topology__ = topology
-        self.__parameters__ = parameters
-        self.__topology_in_proto__ = topology.proto()
-        self.__is_local__ = is_local
-        self.__pserver_spec__ = pserver_spec
-        self.__use_etcd__ = use_etcd
-
-        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
-        # # In local mode, disable sparse_remote_update.
-        if is_local:
-            for param in self.__topology_in_proto__.parameters:
-                if param.sparse_remote_update:
-                    param.sparse_remote_update = False
-
-        self.__gm_create_mode__ = api.CREATE_MODE_NORMAL if not \
-            self.__use_sparse_updater__ else api.CREATE_MODE_SGD_SPARSE_CPU_TRAINING
-        self.__data_types__ = topology.data_type()
-        gm = api.GradientMachine.createFromConfigProto(
-            self.__topology_in_proto__, self.__gm_create_mode__,
-            self.__optimizer__.enable_types())
-        assert isinstance(gm, api.GradientMachine)
-        self.__gradient_machine__ = gm
-        self.__gradient_machine__.randParameters()
-        self.__parameters__.append_gradient_machine(gm)
-        self.__parameter_updater__ = None
-
-    def get_topology_proto(self):
-        return self.__topology_in_proto__
-
-    def __use_remote_sparse_updater__(self):
-        return self.__use_sparse_updater__ and not self.__is_local__
-
-    def __prepare_parameter__(self, in_args):
-        """
-        prepare parameter before forward backward.
-        1. When use remote sparse updater, parameters should be got
-        from ps according to input arguments.
-        :param in_args: input arguments of this batch.
-        :return:
-        """
-        if self.__use_remote_sparse_updater__():
-            self.__gradient_machine__.prefetch(in_args)
-            self.__parameter_updater__.getParametersRemote()
-
-    def save_parameter_to_tar(self, f):
-        self.__parameter_updater__.catchUpWith()
-        self.__parameter_updater__.apply()
-        self.__parameter_updater__.getParametersRemote(True, True)
-        self.__parameters__.to_tar(f)
-        self.__parameter_updater__.restore()
-
-    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
-        """
-        Training method. Will train num_passes of input data.
-
-        :param reader: A reader that reads and yeilds data items. Usually we use a
-                       batched reader to do mini-batch training.
-        :type reader: collections.Iterable
-        :param num_passes: The total train passes.
-        :param event_handler: Event handler. A method will be invoked when event
-                              occurred.
-        :type event_handler: (BaseEvent) => None
-        :param feeding: Feeding is a map of neural network input name and array
-                        index that reader returns.
-        :type feeding: dict|list
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        from data_feeder import DataFeeder
-        if event_handler is None:
-            event_handler = default_event_handler
-        __check_train_args__(**locals())
-
-        self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__, self.__use_etcd__)
-        self.__parameter_updater__.init(self.__gradient_machine__)
-
-        self.__gradient_machine__.start()
-        batch_evaluator = self.__gradient_machine__.makeEvaluator()
-        assert isinstance(batch_evaluator, api.Evaluator)
-        pass_evaluator = self.__gradient_machine__.makeEvaluator()
-        assert isinstance(pass_evaluator, api.Evaluator)
-        out_args = api.Arguments.createArguments(0)
-        feeder = DataFeeder(self.__data_types__, feeding)
-        for pass_id in xrange(num_passes):
-            event_handler(v2_event.BeginPass(pass_id))
-            pass_evaluator.start()
-            self.__parameter_updater__.startPass()
-            for batch_id, data_batch in enumerate(reader()):
-                batch_evaluator.start()
-                event_handler(
-                    v2_event.BeginIteration(
-                        pass_id=pass_id, batch_id=batch_id))
-                pass_type = self.__parameter_updater__.startBatch(
-                    len(data_batch))
-                in_args = feeder(data_batch)
-                self.__prepare_parameter__(in_args)
-                self.__gradient_machine__.forwardBackward(in_args, out_args,
-                                                          pass_type)
-                self.__gradient_machine__.eval(pass_evaluator)
-                self.__gradient_machine__.eval(batch_evaluator)
-                event_handler(
-                    v2_event.EndForwardBackward(
-                        pass_id=pass_id,
-                        batch_id=batch_id,
-                        gm=self.__gradient_machine__))
-                for each_param in self.__gradient_machine__.getNonStaticParameters(
-                ):
-                    self.__parameter_updater__.update(each_param)
-                cost_sum = out_args.sum()
-                cost = cost_sum / len(data_batch)
-                self.__parameter_updater__.finishBatch(cost)
-                batch_evaluator.finish()
-                event_handler(
-                    v2_event.EndIteration(
-                        pass_id=pass_id,
-                        batch_id=batch_id,
-                        cost=cost,
-                        evaluator=batch_evaluator,
-                        gm=self.__gradient_machine__))
-
-            self.__parameter_updater__.finishPass()
-            pass_evaluator.finish()
-            event_handler(
-                v2_event.EndPass(
-                    pass_id,
-                    evaluator=pass_evaluator,
-                    gm=self.__gradient_machine__))
-        self.__gradient_machine__.finish()
-
-    def test(self, reader, feeding=None):
-        """
-        Testing method. Will test input data.
-
-        :param reader: A batch reader that reads and yeilds data items,
-                       it should be a paddle.v2.batch.
-        :type reader: collections.Iterable
-        :param feeding: Feeding is a map of neural network input name and array
-                        index that reader returns.
-        :type feeding: dict
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        from data_feeder import DataFeeder
-        feeder = DataFeeder(self.__data_types__, feeding)
-        evaluator = self.__gradient_machine__.makeEvaluator()
-        out_args = api.Arguments.createArguments(0)
-        evaluator.start()
-        total_cost = 0
-        num_samples = 0.0
-        for data_batch in reader():
-            num_samples += len(data_batch)
-            in_args = feeder(data_batch)
-            self.__prepare_parameter__(in_args)
-            self.__gradient_machine__.forward(in_args, out_args, api.PASS_TEST)
-            total_cost += out_args.sum()
-            self.__gradient_machine__.eval(evaluator)
-
-        evaluator.finish()
-        return v2_event.TestResult(
-            evaluator=evaluator, cost=total_cost / num_samples)
-
-
-def __check_train_args__(reader, event_handler, **kwargs):
-    """
-    Check train function's argument types
-    """
-    if not callable(reader) or not isinstance(reader(), collections.Iterator):
-        raise TypeError('train_data_reader should be a function, '
-                        'which can return a iterator')
-    if not callable(event_handler):
-        raise TypeError('event handler should be a function')
-- 
GitLab


From 1777017a05fc178a5861b1311b686da3e8ecd60a Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:41:09 +0800
Subject: [PATCH 16/73] remove legace go and proto code

---
 go/.gitignore                                 |   3 -
 go/CMakeLists.txt                             |  23 -
 go/cmd/master/CMakeLists.txt                  |  15 -
 go/cmd/master/master.go                       | 120 ---
 go/cmd/pserver/.gitignore                     |   1 -
 go/cmd/pserver/CMakeLists.txt                 |  15 -
 go/cmd/pserver/pserver.go                     | 108 ---
 go/connection/conn.go                         | 120 ---
 go/glide.lock                                 | 233 ------
 go/glide.yaml                                 |  33 -
 go/master/CMakeLists.txt                      |  17 -
 go/master/c/CMakeLists.txt                    |  15 -
 go/master/c/client.go                         | 196 -----
 go/master/client.go                           | 255 -------
 go/master/client_internal_test.go             | 152 ----
 go/master/client_test.go                      | 150 ----
 go/master/etcd_client.go                      | 201 -----
 go/master/inmem_store.go                      |  47 --
 go/master/service.go                          | 510 -------------
 go/master/service_internal_test.go            |  52 --
 go/master/service_test.go                     |  72 --
 go/proto/.gitignore                           |   4 -
 go/pserver/CMakeLists.txt                     |  17 -
 go/pserver/client/CMakeLists.txt              |  17 -
 go/pserver/client/c/.gitignore                |   1 -
 go/pserver/client/c/CMakeLists.txt            |  30 -
 go/pserver/client/c/cclient.go                | 300 --------
 go/pserver/client/c/test/CMakeLists.txt       |  15 -
 go/pserver/client/c/test/test_cclient.c       | 115 ---
 go/pserver/client/c/test/test_mnist.py        | 145 ----
 go/pserver/client/c/test/test_train.py        |  89 ---
 .../client/c/test/testdata/optimizer.pb       | Bin 50 -> 0 bytes
 go/pserver/client/client.go                   | 237 ------
 go/pserver/client/client_test.go              | 268 -------
 go/pserver/client/etcd_client.go              | 266 -------
 go/pserver/client/etcd_client_test.go         | 106 ---
 go/pserver/etcd_client.go                     | 253 -------
 go/pserver/optimizer.go                       | 132 ----
 go/pserver/optimizer_test.go                  |  78 --
 go/pserver/service.go                         | 450 -----------
 go/pserver/service_internal_test.go           |  86 ---
 go/pserver/service_test.go                    | 211 ------
 go/utils/networkhelper/CMakeLists.txt         |  17 -
 go/utils/networkhelper/helper.go              |  59 --
 go/utils/networkhelper/helper_test.go         |  24 -
 proto/.gitignore                              |   1 -
 proto/CMakeLists.txt                          |  57 --
 proto/DataConfig.proto                        |  86 ---
 proto/DataFormat.proto                        |  76 --
 proto/ModelConfig.proto                       | 698 ------------------
 proto/OptimizerConfig.proto                   | 164 ----
 proto/ParameterConfig.proto                   |  83 ---
 proto/ParameterServerConfig.proto             |  50 --
 proto/ParameterService.proto                  | 351 ---------
 proto/README.md                               |   3 -
 proto/TrainerConfig.proto                     | 160 ----
 56 files changed, 6987 deletions(-)
 delete mode 100644 go/.gitignore
 delete mode 100644 go/CMakeLists.txt
 delete mode 100644 go/cmd/master/CMakeLists.txt
 delete mode 100644 go/cmd/master/master.go
 delete mode 100644 go/cmd/pserver/.gitignore
 delete mode 100644 go/cmd/pserver/CMakeLists.txt
 delete mode 100644 go/cmd/pserver/pserver.go
 delete mode 100644 go/connection/conn.go
 delete mode 100644 go/glide.lock
 delete mode 100644 go/glide.yaml
 delete mode 100644 go/master/CMakeLists.txt
 delete mode 100644 go/master/c/CMakeLists.txt
 delete mode 100644 go/master/c/client.go
 delete mode 100644 go/master/client.go
 delete mode 100644 go/master/client_internal_test.go
 delete mode 100644 go/master/client_test.go
 delete mode 100644 go/master/etcd_client.go
 delete mode 100644 go/master/inmem_store.go
 delete mode 100644 go/master/service.go
 delete mode 100644 go/master/service_internal_test.go
 delete mode 100644 go/master/service_test.go
 delete mode 100644 go/proto/.gitignore
 delete mode 100644 go/pserver/CMakeLists.txt
 delete mode 100644 go/pserver/client/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/.gitignore
 delete mode 100644 go/pserver/client/c/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/cclient.go
 delete mode 100644 go/pserver/client/c/test/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/test/test_cclient.c
 delete mode 100644 go/pserver/client/c/test/test_mnist.py
 delete mode 100644 go/pserver/client/c/test/test_train.py
 delete mode 100644 go/pserver/client/c/test/testdata/optimizer.pb
 delete mode 100644 go/pserver/client/client.go
 delete mode 100644 go/pserver/client/client_test.go
 delete mode 100644 go/pserver/client/etcd_client.go
 delete mode 100644 go/pserver/client/etcd_client_test.go
 delete mode 100644 go/pserver/etcd_client.go
 delete mode 100644 go/pserver/optimizer.go
 delete mode 100644 go/pserver/optimizer_test.go
 delete mode 100644 go/pserver/service.go
 delete mode 100644 go/pserver/service_internal_test.go
 delete mode 100644 go/pserver/service_test.go
 delete mode 100644 go/utils/networkhelper/CMakeLists.txt
 delete mode 100644 go/utils/networkhelper/helper.go
 delete mode 100644 go/utils/networkhelper/helper_test.go
 delete mode 100644 proto/.gitignore
 delete mode 100644 proto/CMakeLists.txt
 delete mode 100644 proto/DataConfig.proto
 delete mode 100644 proto/DataFormat.proto
 delete mode 100644 proto/ModelConfig.proto
 delete mode 100644 proto/OptimizerConfig.proto
 delete mode 100644 proto/ParameterConfig.proto
 delete mode 100644 proto/ParameterServerConfig.proto
 delete mode 100644 proto/ParameterService.proto
 delete mode 100644 proto/README.md
 delete mode 100644 proto/TrainerConfig.proto

diff --git a/go/.gitignore b/go/.gitignore
deleted file mode 100644
index 398d70ca375..00000000000
--- a/go/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-vendor/
-.glide/
-proto/*.go
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
deleted file mode 100644
index f3a9296c2c6..00000000000
--- a/go/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-add_subdirectory(pserver/client/c)
-add_subdirectory(cmd/pserver)
-add_subdirectory(cmd/master)
-add_subdirectory(master/c)
-add_subdirectory(master)
-add_subdirectory(pserver)
-add_subdirectory(pserver/client)
-add_subdirectory(utils/networkhelper)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
deleted file mode 100644
index fc99d8d3bd1..00000000000
--- a/go/cmd/master/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-go_binary(master SRC master.go)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
deleted file mode 100644
index 537df59c860..00000000000
--- a/go/cmd/master/master.go
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"os/signal"
-	"strconv"
-	"strings"
-	"time"
-
-	log "github.com/inconshreveable/log15"
-	"github.com/namsral/flag"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
-)
-
-func main() {
-	port := flag.Int("port", 8080, "port of the master server.")
-	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
-	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
-	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warn, error, crit")
-	flag.Parse()
-
-	lvl, err := log.LvlFromString(*logLevel)
-	if err != nil {
-		panic(err)
-	}
-
-	log.Root().SetHandler(
-		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-
-	if *endpoints == "" {
-		log.Warn("-endpoints not set, fault tolerance not be enabled.")
-	}
-
-	var store master.Store
-	if *endpoints != "" {
-		eps := strings.Split(*endpoints, ",")
-		ip, err := networkhelper.GetExternalIP()
-		if err != nil {
-			log.Crit("get external ip error", log.Ctx{"error": err})
-			panic(err)
-		}
-
-		addr := fmt.Sprintf("%s:%d", ip, *port)
-		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
-		if err != nil {
-			log.Crit("error creating etcd client.", log.Ctx{"error": err})
-			panic(err)
-		}
-	} else {
-		store = &master.InMemStore{}
-	}
-
-	shutdown := func() {
-		log.Info("shutting down gracefully")
-		err := store.Shutdown()
-		if err != nil {
-			log.Error("shutdown error", log.Ctx{"error": err})
-		}
-	}
-
-	// Guaranteed to run even panic happens.
-	defer shutdown()
-
-	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt)
-
-	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
-	if err != nil {
-		log.Crit("error creating new service.", log.Ctx{"error": err})
-		panic(err)
-	}
-
-	err = rpc.Register(s)
-	if err != nil {
-		log.Crit("error registering to etcd.", log.Ctx{"error": err})
-		panic(err)
-	}
-
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		log.Crit("error listing to port", log.Ctx{"error": err, "port": *port})
-		panic(err)
-	}
-
-	go func() {
-		err = http.Serve(l, nil)
-		if err != nil {
-			log.Crit("error serving HTTP", log.Ctx{"error": err})
-			panic(err)
-		}
-	}()
-
-	<-c
-}
diff --git a/go/cmd/pserver/.gitignore b/go/cmd/pserver/.gitignore
deleted file mode 100644
index fffd9adc4fd..00000000000
--- a/go/cmd/pserver/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-pserver
diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt
deleted file mode 100644
index 20d033c9386..00000000000
--- a/go/cmd/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
deleted file mode 100644
index 271274cafc5..00000000000
--- a/go/cmd/pserver/pserver.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"os/signal"
-	"strconv"
-	"time"
-
-	"github.com/namsral/flag"
-	"github.com/topicai/candy"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/inconshreveable/log15"
-)
-
-func main() {
-	port := flag.Int("port", 8001, "port of the pserver")
-	index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry")
-	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
-		"comma separated endpoint string for pserver to connect to etcd")
-	dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
-	etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds")
-	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
-	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
-	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
-	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warn, error, crit")
-	flag.Parse()
-
-	lvl, err := log.LvlFromString(*logLevel)
-	if err != nil {
-		panic(err)
-	}
-
-	log.Root().SetHandler(
-		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-
-	var idx int
-
-	var cp pserver.Checkpoint
-	var e *pserver.EtcdClient
-	if *index >= 0 {
-		idx = *index
-	} else {
-		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL)
-		idx, err = e.Register(*port)
-		candy.Must(err)
-
-		cp, err = pserver.LoadCheckpoint(e, idx)
-		if err != nil {
-			if err == pserver.ErrCheckpointNotFound {
-				log.Info("load checkpoint error", "error", err)
-			} else {
-				panic(err)
-			}
-		}
-	}
-
-	shutdown := func() {
-		log.Info("shutting down gracefully")
-		sErr := e.Shutdown()
-		if sErr != nil {
-			log.Error("error shutting down", log.Ctx{"error": sErr})
-		}
-	}
-
-	// Guaranteed to run even panic happens.
-	defer shutdown()
-
-	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt)
-
-	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
-	candy.Must(err)
-
-	err = rpc.Register(s)
-	candy.Must(err)
-
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	candy.Must(err)
-
-	go func() {
-		log.Info("serving pserver", log.Ctx{"port": *port})
-		err = http.Serve(l, nil)
-		candy.Must(err)
-	}()
-
-	<-c
-}
diff --git a/go/connection/conn.go b/go/connection/conn.go
deleted file mode 100644
index b8353e8e18e..00000000000
--- a/go/connection/conn.go
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package connection
-
-import (
-	"errors"
-	"net/rpc"
-	"sync"
-
-	log "github.com/sirupsen/logrus"
-)
-
-// TODO(helin): add TCP re-connect logic
-
-// Conn is a connection to a parameter server
-type Conn struct {
-	mu       sync.Mutex
-	client   *rpc.Client
-	waitConn chan struct{}
-}
-
-// New creates a new connection.
-func New() *Conn {
-	c := &Conn{}
-	return c
-}
-
-// Close closes the connection.
-func (c *Conn) Close() error {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-
-	if c.client == nil {
-		return nil
-	}
-
-	return c.client.Close()
-}
-
-// Connect connects the connection to a address.
-func (c *Conn) Connect(addr string) error {
-	c.mu.Lock()
-	if c.client != nil {
-		err := c.client.Close()
-		if err != nil {
-			c.mu.Unlock()
-			return err
-		}
-
-		c.client = nil
-	}
-	c.mu.Unlock()
-
-	client, err := rpc.DialHTTP("tcp", addr)
-	if err != nil {
-		return err
-	}
-
-	c.mu.Lock()
-	defer c.mu.Unlock()
-
-	if c.client == nil {
-		c.client = client
-		if c.waitConn != nil {
-			close(c.waitConn)
-			c.waitConn = nil
-		}
-	} else {
-		err := client.Close()
-		if err != nil {
-			log.Errorln(err)
-		}
-
-		return errors.New("client already set from a concurrent goroutine")
-	}
-
-	return nil
-}
-
-// TODO(helin): refactor Call to be able to perform given retry
-// policy.
-
-// Call make a RPC call.
-//
-// Call will be blocked until the connection to remote RPC service
-// being established.
-func (c *Conn) Call(serviceMethod string, args interface{}, reply interface{}) error {
-	c.mu.Lock()
-	client := c.client
-	var waitCh chan struct{}
-	if client == nil {
-		if c.waitConn != nil {
-			waitCh = c.waitConn
-		} else {
-			waitCh = make(chan struct{})
-			c.waitConn = waitCh
-		}
-	}
-	c.mu.Unlock()
-
-	if waitCh != nil {
-		// wait until new connection being established
-		<-waitCh
-		return c.Call(serviceMethod, args, reply)
-	}
-
-	return client.Call(serviceMethod, args, reply)
-}
diff --git a/go/glide.lock b/go/glide.lock
deleted file mode 100644
index d15fc934dbe..00000000000
--- a/go/glide.lock
+++ /dev/null
@@ -1,233 +0,0 @@
-hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
-updated: 2017-10-30T03:46:19.137696069Z
-imports:
-- name: github.com/alecthomas/gometalinter
-  version: bae2f1293d092fd8167939d5108d1b025eaef9de
-- name: github.com/beorn7/perks
-  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
-  subpackages:
-  - quantile
-- name: github.com/boltdb/bolt
-  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
-- name: github.com/cockroachdb/cmux
-  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
-- name: github.com/coreos/etcd
-  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
-  subpackages:
-  - alarm
-  - auth
-  - auth/authpb
-  - client
-  - clientv3
-  - clientv3/concurrency
-  - compactor
-  - discovery
-  - embed
-  - error
-  - etcdserver
-  - etcdserver/api
-  - etcdserver/api/etcdhttp
-  - etcdserver/api/v2http
-  - etcdserver/api/v2http/httptypes
-  - etcdserver/api/v3client
-  - etcdserver/api/v3election
-  - etcdserver/api/v3election/v3electionpb
-  - etcdserver/api/v3election/v3electionpb/gw
-  - etcdserver/api/v3lock
-  - etcdserver/api/v3lock/v3lockpb
-  - etcdserver/api/v3lock/v3lockpb/gw
-  - etcdserver/api/v3rpc
-  - etcdserver/api/v3rpc/rpctypes
-  - etcdserver/auth
-  - etcdserver/etcdserverpb
-  - etcdserver/etcdserverpb/gw
-  - etcdserver/membership
-  - etcdserver/stats
-  - lease
-  - lease/leasehttp
-  - lease/leasepb
-  - mvcc
-  - mvcc/backend
-  - mvcc/mvccpb
-  - pkg/adt
-  - pkg/contention
-  - pkg/cors
-  - pkg/cpuutil
-  - pkg/crc
-  - pkg/debugutil
-  - pkg/fileutil
-  - pkg/httputil
-  - pkg/idutil
-  - pkg/ioutil
-  - pkg/logutil
-  - pkg/monotime
-  - pkg/netutil
-  - pkg/pathutil
-  - pkg/pbutil
-  - pkg/runtime
-  - pkg/schedule
-  - pkg/srv
-  - pkg/tlsutil
-  - pkg/transport
-  - pkg/types
-  - pkg/wait
-  - proxy/grpcproxy/adapter
-  - raft
-  - raft/raftpb
-  - rafthttp
-  - snap
-  - snap/snappb
-  - store
-  - version
-  - wal
-  - wal/walpb
-- name: github.com/coreos/go-semver
-  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
-  subpackages:
-  - semver
-- name: github.com/coreos/go-systemd
-  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
-  subpackages:
-  - daemon
-  - journal
-  - util
-- name: github.com/coreos/pkg
-  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
-  subpackages:
-  - capnslog
-- name: github.com/dgrijalva/jwt-go
-  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
-- name: github.com/ghodss/yaml
-  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
-- name: github.com/go-stack/stack
-  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
-- name: github.com/gogo/protobuf
-  version: 909568be09de550ed094403c2bf8a261b5bb730a
-  subpackages:
-  - proto
-- name: github.com/golang/protobuf
-  version: 4bd1920723d7b7c925de087aa32e2187708897f7
-  subpackages:
-  - jsonpb
-  - proto
-- name: github.com/golang/snappy
-  version: 553a641470496b2327abcac10b36396bd98e45c9
-- name: github.com/google/btree
-  version: 925471ac9e2131377a91e1595defec898166fe49
-- name: github.com/grpc-ecosystem/go-grpc-prometheus
-  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
-- name: github.com/grpc-ecosystem/grpc-gateway
-  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
-  subpackages:
-  - runtime
-  - runtime/internal
-  - utilities
-- name: github.com/inconshreveable/log15
-  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
-- name: github.com/jonboulle/clockwork
-  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
-- name: github.com/mattn/go-colorable
-  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
-- name: github.com/mattn/go-isatty
-  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
-- name: github.com/matttproud/golang_protobuf_extensions
-  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
-  subpackages:
-  - pbutil
-- name: github.com/namsral/flag
-  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
-- name: github.com/PaddlePaddle/recordio
-  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
-- name: github.com/prometheus/client_golang
-  version: c5b7fccd204277076155f10851dad72b76a49317
-  subpackages:
-  - prometheus
-- name: github.com/prometheus/client_model
-  version: 6f3806018612930941127f2a7c6c453ba2c527d2
-  subpackages:
-  - go
-- name: github.com/prometheus/common
-  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
-  subpackages:
-  - expfmt
-  - internal/bitbucket.org/ww/goautoneg
-  - model
-- name: github.com/prometheus/procfs
-  version: a1dba9ce8baed984a2495b658c82687f8157b98f
-  subpackages:
-  - xfs
-- name: github.com/satori/go.uuid
-  version: 879c5887cd475cd7864858769793b2ceb0d44feb
-- name: github.com/sirupsen/logrus
-  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
-- name: github.com/topicai/candy
-  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
-- name: github.com/ugorji/go
-  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
-  subpackages:
-  - codec
-- name: github.com/xiang90/probing
-  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
-- name: golang.org/x/crypto
-  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
-  repo: https://github.com/golang/crypto.git
-  vcs: git
-  subpackages:
-  - bcrypt
-  - blowfish
-  - ssh/terminal
-- name: golang.org/x/net
-  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
-  subpackages:
-  - context
-  - http2
-  - http2/hpack
-  - idna
-  - internal/timeseries
-  - lex/httplex
-  - trace
-- name: golang.org/x/sys
-  version: e48874b42435b4347fc52bdee0424a52abc974d7
-  repo: https://github.com/golang/sys.git
-  vcs: git
-  subpackages:
-  - unix
-  - windows
-- name: golang.org/x/text
-  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
-  repo: https://github.com/golang/text.git
-  vcs: git
-  subpackages:
-  - secure/bidirule
-  - transform
-  - unicode/bidi
-  - unicode/norm
-- name: google.golang.org/grpc
-  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
-  subpackages:
-  - codes
-  - credentials
-  - grpclog
-  - internal
-  - keepalive
-  - metadata
-  - naming
-  - peer
-  - stats
-  - tap
-  - transport
-- name: gopkg.in/yaml.v2
-  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
-testImports:
-- name: github.com/davecgh/go-spew
-  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
-  subpackages:
-  - spew
-- name: github.com/pmezard/go-difflib
-  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
-  subpackages:
-  - difflib
-- name: github.com/stretchr/testify
-  version: 05e8a0eda380579888eb53c394909df027f06991
-  subpackages:
-  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
deleted file mode 100644
index c5d66694acd..00000000000
--- a/go/glide.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-package: github.com/PaddlePaddle/Paddle/go
-import:
-- package: github.com/PaddlePaddle/recordio
-- package: github.com/coreos/etcd
-  version: ^3.2.1
-  subpackages:
-  - clientv3
-  - clientv3/concurrency
-  - embed
-  - etcdserver
-- package: github.com/namsral/flag
-  version: ^1.7.4-pre
-- package: github.com/sirupsen/logrus
-  version: ^1.0.0
-- package: github.com/topicai/candy
-- package: golang.org/x/crypto
-  repo: https://github.com/golang/crypto.git
-  vcs: git
-- package: golang.org/x/sys
-  repo: https://github.com/golang/sys.git
-  vcs: git
-- package: golang.org/x/text
-  repo: https://github.com/golang/text.git
-  vcs: git
-- package: github.com/satori/go.uuid
-  version: v1.1.0
-- package: github.com/alecthomas/gometalinter
-  version: v1.2.1
-- package: github.com/inconshreveable/log15
-  version: v2.13
-- package: github.com/go-stack/stack
-  version: v1.6.0
-- package: github.com/golang/protobuf
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
deleted file mode 100644
index b5101c3479d..00000000000
--- a/go/master/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(master_test)
-endif()
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
deleted file mode 100644
index 58b44e6445b..00000000000
--- a/go/master/c/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-go_library(paddle_master SHARED DEPS paddle_go_optimizer)
diff --git a/go/master/c/client.go b/go/master/c/client.go
deleted file mode 100644
index 42c176d00bd..00000000000
--- a/go/master/c/client.go
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-/*
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#define PADDLE_MASTER_OK    0
-#define PADDLE_MASTER_ERROR -1
-
-#define PADDLE_SAVE_MODEL_OK   1
-#define PADDLE_SAVE_MODEL_SKIP 0
-
-typedef int paddle_master_client;
-*/
-import "C"
-
-import (
-	"strings"
-	"sync"
-	"time"
-	"unsafe"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	log "github.com/inconshreveable/log15"
-)
-
-var mu sync.Mutex
-var handleMap = make(map[C.paddle_master_client]*master.Client)
-var curHandle C.paddle_master_client
-
-func init() {
-	log.Root().SetHandler(
-		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-}
-
-func add(c *master.Client) C.paddle_master_client {
-	mu.Lock()
-	defer mu.Unlock()
-	client := curHandle
-	curHandle++
-	handleMap[client] = c
-	return client
-}
-
-func get(client C.paddle_master_client) *master.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	return handleMap[client]
-}
-
-func remove(client C.paddle_master_client) *master.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	h := handleMap[client]
-	delete(handleMap, client)
-	return h
-}
-
-//export paddle_new_etcd_master_client
-//
-// bufSize is the record buffer size.
-func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
-	p := C.GoString(etcdEndpoints)
-	endpoints := strings.Split(p, ",")
-	c, err := master.NewClient(
-		master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
-		master.WithBuffer(bufSize),
-	)
-	if err != nil {
-		panic(err)
-	}
-
-	return add(c)
-}
-
-//export paddle_new_master_client
-//
-// bufSize is the record buffer size.
-func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
-	a := C.GoString(addr)
-	c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
-	if err != nil {
-		panic(err)
-	}
-
-	return add(c)
-}
-
-//export paddle_release_master_client
-func paddle_release_master_client(client C.paddle_master_client) {
-	remove(client)
-}
-
-//export paddle_start_get_records
-func paddle_start_get_records(client C.paddle_master_client, pass C.int) {
-	c := get(client)
-	c.StartGetRecords(int(pass))
-}
-
-//export paddle_set_dataset
-func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
-	c := get(client)
-	var paths []string
-	for i := 0; i < int(size); i++ {
-		ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
-		str := C.GoString(*ptr)
-		paths = append(paths, str)
-	}
-	err := c.SetDataset(paths)
-	if err != nil {
-		log.Error("error set dataset",
-			log.Ctx{"error": err, "paths": paths})
-		return C.PADDLE_MASTER_ERROR
-	}
-
-	return C.PADDLE_MASTER_OK
-}
-
-// paddle_next_record gets the nexts training record.
-//
-// returns number of bytes of the records if success, -1 if failed, -2 if pass end.
-//
-//export paddle_next_record
-func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
-	c := get(client)
-	r, err := c.NextRecord()
-	if err != nil {
-		// NOTE: use errors to indicate pass ends
-		if err.Error() == master.ErrAllTaskFailed.Error() ||
-			err.Error() == master.ErrNoMoreAvailable.Error() ||
-			err.Error() == master.ErrPassBefore.Error() {
-			return -2
-		}
-		*record = (*C.uchar)(nil)
-		return -1
-	}
-
-	if len(r) == 0 {
-		// Empty record
-		*record = (*C.uchar)(nil)
-		return 0
-	}
-
-	size := C.size_t(len(r))
-	*record = (*C.uchar)(C.malloc(size))
-	C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
-	return C.int(size)
-}
-
-// paddle_request_save_model requests the master server to approve the
-// caller to save the model.
-//
-// returns 1 if the save the model request is approved, 0 if the
-// request is rejected because other trainer is saving the model, -1
-// if error happened.
-//
-//export paddle_request_save_model
-func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
-	c := get(client)
-	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
-	if err != nil {
-		log.Error("error request save model", log.Ctx{"error": err})
-		return C.PADDLE_MASTER_ERROR
-	}
-
-	if need {
-		return C.PADDLE_SAVE_MODEL_OK
-	}
-
-	return C.PADDLE_SAVE_MODEL_SKIP
-}
-
-//export mem_free
-func mem_free(p unsafe.Pointer) {
-	// "free" may be a better name for this function, but doing so
-	// will cause calling any function of this library from Python
-	// ctypes hanging.
-	C.free(p)
-}
-
-func main() {}
diff --git a/go/master/client.go b/go/master/client.go
deleted file mode 100644
index e43903dd14e..00000000000
--- a/go/master/client.go
+++ /dev/null
@@ -1,255 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"os"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/recordio"
-	"github.com/coreos/etcd/clientv3"
-	log "github.com/inconshreveable/log15"
-)
-
-// Client is the client of the master server.
-type Client struct {
-	conn    *connection.Conn
-	ch      chan record
-	bufSize int
-}
-
-type record struct {
-	r   []byte
-	err error
-}
-
-// WithBuffer sets the client to buffer the training record.
-//
-// bufSize is the record buffer size. NextRecord will read from this
-// buffer.
-func WithBuffer(bufSize int) func(*Client) error {
-	return func(c *Client) error {
-		if bufSize <= 0 {
-			return nil
-		}
-		c.bufSize = bufSize
-		return nil
-	}
-}
-
-// WithAddr sets the client to use fixed master address.
-func WithAddr(addr string) func(c *Client) error {
-	return func(c *Client) error {
-		ch := make(chan string, 1)
-		ch <- addr
-		go c.monitorMaster(ch)
-		return nil
-	}
-}
-
-// WithEtcd sets the client to use etcd for master discovery.
-func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
-	return func(c *Client) error {
-		var cli *clientv3.Client
-		f := func() error {
-			var err error
-			cli, err = clientv3.New(clientv3.Config{
-				Endpoints:   endpoints,
-				DialTimeout: timeout,
-			})
-			return err
-		}
-		for {
-			err := f()
-			if err != nil {
-				log.Warn("create etcd client error", log.Ctx{"error": err})
-			} else {
-				break
-			}
-			time.Sleep(time.Second)
-		}
-
-		ch := make(chan string, 1)
-		a, err := GetKey(cli, DefaultAddrPath, timeout)
-		if err != nil {
-			return err
-		}
-
-		if a != "" {
-			// Master is registered, send to the master address
-			// channel.
-			ch <- a
-		}
-
-		go watchKey(cli, DefaultAddrPath, ch)
-		go c.monitorMaster(ch)
-		return nil
-	}
-}
-
-// NewClient creates a new Client.
-func NewClient(opts ...func(*Client) error) (*Client, error) {
-	c := &Client{}
-	c.conn = connection.New()
-
-	for _, opt := range opts {
-		err := opt(c)
-		if err != nil {
-			return nil, err
-		}
-	}
-	c.ch = make(chan record, c.bufSize)
-	return c, nil
-}
-
-// StartGetRecords must be called at beginning of each pass
-func (c *Client) StartGetRecords(passID int) {
-	go c.getRecords(passID)
-}
-
-func (c *Client) getRecords(passID int) {
-	i := 0
-	for {
-		t, err := c.getTask(passID)
-		if err != nil {
-			if err.Error() == ErrPassBefore.Error() ||
-				err.Error() == ErrNoMoreAvailable.Error() ||
-				err.Error() == ErrAllTaskFailed.Error() {
-				c.ch <- record{nil, err}
-				break
-			}
-
-			if i%60 == 0 {
-				log.Debug("getTask of passID error.",
-					log.Ctx{"error": err, "passID": passID})
-				i = 0
-			}
-
-			// if err.Error() == ErrPassAfter.Error()
-			//   wait util last pass finishes
-			// if other error such as network error
-			//   wait to reconnect or task time out
-			time.Sleep(time.Second * 3)
-			i += 3
-			continue
-		}
-
-		for _, chunk := range t.Chunks {
-			f, e := os.Open(chunk.Path)
-			if e != nil {
-				log.Error("error open chunk", log.Ctx{"error": e})
-				continue
-			}
-
-			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
-			for s.Scan() {
-				c.ch <- record{s.Record(), nil}
-			}
-
-			if s.Err() != nil {
-				c.ch <- record{nil, s.Err()}
-				log.Error(
-					"error scan chunk",
-					log.Ctx{"error": err, "path": chunk.Path},
-				)
-			}
-
-			err = f.Close()
-			if err != nil {
-				log.Error("error close record file", log.Ctx{"error": err})
-			}
-		}
-
-		// We treat a task as finished whenever the last data
-		// instance of the task is read. This is not exactly
-		// correct, but a reasonable approximation.
-		err = c.taskFinished(t.Meta.ID)
-		if err != nil {
-			log.Error("task finish callback error.", log.Ctx{"error": err})
-		}
-	}
-}
-
-func (c *Client) monitorMaster(addrCh <-chan string) {
-	lastMaster := ""
-	for curMaster := range addrCh {
-		// connect to the new address once address changed.
-		if curMaster != lastMaster {
-			if curMaster == "" {
-				err := c.conn.Close()
-				if err != nil {
-					log.Error("close old master addr error", log.Ctx{"error": err})
-				}
-			} else {
-				err := c.conn.Connect(curMaster)
-				if err != nil {
-					log.Error("connect to new master addr error", log.Ctx{"error": err})
-
-					// connect to addr failed, set
-					// to last known addr in order
-					// to retry next time.
-					curMaster = lastMaster
-				}
-			}
-		}
-		lastMaster = curMaster
-	}
-}
-
-// SetDataset sets dataset to dispatch for the master server.
-//
-// SetDataset can be call multiple times at one pass. But only the first call
-// will be honored.
-//
-// After all tasks are done, another call of SetDataset will start another pass.
-func (c *Client) SetDataset(globPaths []string) error {
-	err := c.conn.Call("Service.SetDataset", globPaths, nil)
-	return err
-}
-
-// getTask gets a new task from the master server.
-func (c *Client) getTask(passID int) (Task, error) {
-	var t Task
-	err := c.conn.Call("Service.GetTask", passID, &t)
-	return t, err
-}
-
-// TaskFinished tells the master server a task is finished.
-func (c *Client) taskFinished(taskID int) error {
-	return c.conn.Call("Service.TaskFinished", taskID, nil)
-}
-
-// TaskFailed tell the master server as task is failed.
-func (c *Client) taskFailed(meta TaskMeta) error {
-	return c.conn.Call("Service.TaskFailed", meta, nil)
-}
-
-// NextRecord returns next record in the dataset.
-//
-// NextRecord will block until the next record is available. It is
-// thread-safe.
-func (c *Client) NextRecord() ([]byte, error) {
-	r := <-c.ch
-	return r.r, r.err
-}
-
-// RequestSaveModel requests the master server to approve the caller
-// to save the model.
-func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
-	var need bool
-	err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need)
-	return need, err
-}
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
deleted file mode 100644
index 37028a9e1f8..00000000000
--- a/go/master/client_internal_test.go
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"strconv"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/recordio"
-)
-
-const (
-	totalTask    = 20
-	chunkPerTask = 10
-)
-
-func TestGetFinishTask(t *testing.T) {
-	const path = "/tmp/master_client_test_0"
-
-	l, err := net.Listen("tcp", ":0")
-	if err != nil {
-		panic(err)
-	}
-
-	ss := strings.Split(l.Addr().String(), ":")
-	p, err := strconv.Atoi(ss[len(ss)-1])
-	if err != nil {
-		panic(err)
-	}
-	go func(l net.Listener) {
-		s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
-		if sErr != nil {
-			panic(sErr)
-		}
-
-		server := rpc.NewServer()
-		sErr = server.Register(s)
-		if sErr != nil {
-			panic(sErr)
-		}
-
-		mux := http.NewServeMux()
-		mux.Handle(rpc.DefaultRPCPath, server)
-		sErr = http.Serve(l, mux)
-		if sErr != nil {
-			panic(sErr)
-		}
-	}(l)
-
-	f, err := os.Create(path)
-	if err != nil {
-		panic(err)
-	}
-
-	for i := 0; i < totalTask*chunkPerTask; i++ {
-		w := recordio.NewWriter(f, -1, -1)
-		_, err = w.Write(nil)
-		if err != nil {
-			panic(err)
-		}
-
-		// call Close to force RecordIO writing a chunk.
-		err = w.Close()
-		if err != nil {
-			panic(err)
-		}
-	}
-	err = f.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	// Manually intialize client to avoid calling c.getRecords()
-	c := &Client{}
-	c.conn = connection.New()
-	addr := fmt.Sprintf(":%d", p)
-	ch := make(chan string, 1)
-	ch <- addr
-	go c.monitorMaster(ch)
-
-	err = c.SetDataset([]string{path})
-	if err != nil {
-		panic(err)
-	}
-
-	checkOnePass := func(i int) {
-		var tasks []Task
-		for idx := 0; idx < totalTask; idx++ {
-			task, cErr := c.getTask(i)
-			if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
-				t.Fatalf("error: %v, pass: %d\n", cErr, i)
-			}
-			tasks = append(tasks, task)
-		}
-
-		// getting task before task finishes should return error
-		_, cErr := c.getTask(i)
-		if cErr == nil {
-			t.Fatalf("Should get error, pass: %d\n", i)
-		}
-
-		cErr = c.taskFinished(tasks[0].Meta.ID)
-		if cErr != nil {
-			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
-		}
-		// call taskFailed once won't put the task to failed queue, just ensure
-		// the call
-		cErr = c.taskFailed(tasks[0].Meta)
-		if cErr != nil {
-			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
-		}
-
-		tasks = tasks[1:]
-		_, cErr = c.getTask(i)
-		if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
-			t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr)
-		}
-
-		for _, task := range tasks {
-			cErr = c.taskFinished(task.Meta.ID)
-			if cErr != nil {
-				t.Fatal(cErr)
-			}
-		}
-	}
-
-	for i := 0; i < 10; i++ {
-		// init pass data
-		c.StartGetRecords(i)
-		checkOnePass(i)
-	}
-}
diff --git a/go/master/client_test.go b/go/master/client_test.go
deleted file mode 100644
index 01ecad2dead..00000000000
--- a/go/master/client_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master_test
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
-)
-
-// tool function for testing output goroutine ids
-func goid() int {
-	var buf [64]byte
-	n := runtime.Stack(buf[:], false)
-	idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0]
-	id, err := strconv.Atoi(idField)
-	if err != nil {
-		panic(fmt.Sprintf("cannot get goroutine id: %v", err))
-	}
-	return id
-}
-
-func TestNextRecord(t *testing.T) {
-	const (
-		path  = "/tmp/master_client_TestFull"
-		total = 50
-	)
-	l, err := net.Listen("tcp", ":0")
-	if err != nil {
-		panic(err)
-	}
-
-	ss := strings.Split(l.Addr().String(), ":")
-	p, err := strconv.Atoi(ss[len(ss)-1])
-	if err != nil {
-		panic(err)
-	}
-	go func(l net.Listener) {
-		s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1)
-		if err != nil {
-			panic(err)
-		}
-
-		server := rpc.NewServer()
-		err = server.Register(s)
-		if err != nil {
-			panic(err)
-		}
-
-		mux := http.NewServeMux()
-		mux.Handle(rpc.DefaultRPCPath, server)
-		err = http.Serve(l, mux)
-		if err != nil {
-			panic(err)
-		}
-	}(l)
-
-	f, err := os.Create(path)
-	if err != nil {
-		panic(err)
-	}
-
-	w := recordio.NewWriter(f, 1, -1)
-	for i := 0; i < total; i++ {
-		_, err = w.Write([]byte{byte(i)})
-		if err != nil {
-			panic(err)
-		}
-	}
-
-	err = w.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	err = f.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	// start several client to test task fetching
-	var wg sync.WaitGroup
-	for i := 0; i < 4; i++ {
-		wg.Add(1)
-		// test for multiple concurrent clients
-		go func() {
-			defer wg.Done()
-			// each go-routine needs a single client connection instance
-			c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1))
-			if e != nil {
-				t.Fatal(e)
-			}
-			e = c.SetDataset([]string{path})
-			if e != nil {
-				panic(e)
-			}
-
-			// test for n passes
-			for pass := 0; pass < 10; pass++ {
-				c.StartGetRecords(pass)
-
-				received := make(map[byte]bool)
-				taskid := 0
-				for {
-					r, e := c.NextRecord()
-					if e != nil {
-						// ErrorPassAfter will wait, else break for next pass
-						if e.Error() == master.ErrPassBefore.Error() ||
-							e.Error() == master.ErrNoMoreAvailable.Error() {
-							break
-						}
-						t.Fatal(pass, taskid, "Read error:", e)
-					}
-					if len(r) != 1 {
-						t.Fatal(pass, taskid, "Length should be 1.", r)
-					}
-					if received[r[0]] {
-						t.Fatal(pass, taskid, "Received duplicate.", received, r)
-					}
-					taskid++
-					received[r[0]] = true
-				}
-			}
-		}()
-	}
-	wg.Wait()
-}
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
deleted file mode 100644
index 36fe6112744..00000000000
--- a/go/master/etcd_client.go
+++ /dev/null
@@ -1,201 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"context"
-	"time"
-
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	// DefaultLockPath is the default etcd master lock path.
-	DefaultLockPath = "/master/lock"
-	// DefaultStatePath is the default etcd key for master state.
-	DefaultStatePath = "/master/state"
-	// DefaultAddrPath is the default etcd key for master address.
-	DefaultAddrPath = "/master/addr"
-)
-
-// EtcdClient is the etcd client that the master uses for fault
-// tolerance and service registry.
-type EtcdClient struct {
-	lockPath  string
-	statePath string
-	client    *clientv3.Client
-	lock      *concurrency.Mutex
-	sess      *concurrency.Session
-}
-
-// NewEtcdClient creates a new EtcdClient.
-func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
-	log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints})
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   endpoints,
-		DialTimeout: dialTimeout,
-	})
-	if err != nil {
-		return nil, err
-	}
-
-	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
-	if err != nil {
-		return nil, err
-	}
-
-	lock := concurrency.NewMutex(sess, lockPath)
-	// It's fine for the lock to get stuck, in this case we have
-	// multiple master servers running (only configured to have
-	// one master running, but split-brain problem may cause
-	// multiple master servers running), and the cluster management
-	// software will kill one of them.
-	log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath})
-	err = lock.Lock(context.TODO())
-	if err != nil {
-		return nil, err
-	}
-	log.Info("Successfully acquired lock at %s.", log.Ctx{"path": lockPath})
-
-	put := clientv3.OpPut(addrPath, addr)
-	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
-	if err != nil {
-		return nil, err
-	}
-
-	if !resp.Succeeded {
-		log.Crit("No longer owns the master lock. Exiting.")
-		panic("No longer owns the master lock. Exiting.")
-	}
-
-	e := &EtcdClient{
-		lockPath:  lockPath,
-		statePath: statePath,
-		client:    cli,
-		lock:      lock,
-		sess:      sess,
-	}
-
-	return e, nil
-}
-
-// Save saves the state into the etcd.
-func (e *EtcdClient) Save(state []byte) error {
-	ctx := context.TODO()
-	put := clientv3.OpPut(e.statePath, string(state))
-	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
-	if err != nil {
-		return err
-	}
-
-	if !resp.Succeeded {
-		log.Error("No longer owns the lock, trying to lock again")
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-		err := e.lock.Lock(ctx)
-		cancel()
-		if err != nil {
-			// We lost the master lock and can not acquire
-			// it back, it means some other master is
-			// already started. We don't want cluster
-			// management system to kill the master server
-			// who is holding the lock and running
-			// correctly. So the most feasible solution is
-			// to kill current master server. The current
-			// state is not saved, but the trainer's RPC
-			// call will fail, so the trainer will retry.
-			log.Crit("Could not acquire the lock at %s: %v. Exiting.", log.Ctx{"path": e.lockPath, "error": err})
-			panic("Could not acquire the lock at %s: %v. Exiting.")
-		}
-		log.Info("Successfully acquired lock at %s.", e.lockPath)
-		return e.Save(state)
-	}
-
-	return nil
-}
-
-// Load loads the state from etcd.
-func (e *EtcdClient) Load() ([]byte, error) {
-	ctx := context.TODO()
-	get := clientv3.OpGet(e.statePath)
-
-	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
-	if err != nil {
-		return nil, err
-	}
-
-	if !resp.Succeeded {
-		log.Error("No longer owns the lock, trying to lock and load again.")
-		err = e.lock.Lock(context.Background())
-		if err != nil {
-			return nil, err
-		}
-
-		return e.Load()
-	}
-
-	kvs := resp.Responses[0].GetResponseRange().Kvs
-	if len(kvs) == 0 {
-		// No state exists
-		return nil, nil
-	}
-
-	state := kvs[0].Value
-	return state, nil
-}
-
-// Shutdown shuts down the etcd client gracefully.
-func (e *EtcdClient) Shutdown() error {
-	err := e.sess.Close()
-	newErr := e.client.Close()
-	if newErr != nil {
-		if err == nil {
-			err = newErr
-		} else {
-			log.Error("shutdown error", log.Ctx{"error": newErr})
-		}
-	}
-
-	return err
-}
-
-// GetKey gets the value by the specify key.
-func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	resp, err := c.Get(ctx, key)
-	cancel()
-	if err != nil {
-		return "", err
-	}
-	kvs := resp.Kvs
-	if len(kvs) == 0 {
-		return "", nil
-	}
-	v := kvs[0].Value
-	return string(v), nil
-}
-
-// watchKey watches the specify key and send to valChan if there is some event.
-func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
-	rch := c.Watch(context.Background(), key)
-	for wresp := range rch {
-		for _, ev := range wresp.Events {
-			// if received event is DELETE, the value will be an empty string
-			log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value})
-			valChan <- string(ev.Kv.Value)
-		}
-	}
-}
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
deleted file mode 100644
index 33b4714317f..00000000000
--- a/go/master/inmem_store.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import "sync"
-
-// InMemStore is an in memory implementation of Store interface.
-//
-// It does not tolerate the fault that causes the program to crash.
-type InMemStore struct {
-	mu  sync.Mutex
-	buf []byte
-}
-
-// Save saves the state into the in-memory store.
-func (m *InMemStore) Save(state []byte) error {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	m.buf = state
-	return nil
-}
-
-// Load loads the state from the in-memory store.
-func (m *InMemStore) Load() ([]byte, error) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	return m.buf, nil
-}
-
-// Shutdown shuts down the in mem store.
-func (m *InMemStore) Shutdown() error {
-	return nil
-}
diff --git a/go/master/service.go b/go/master/service.go
deleted file mode 100644
index 39f746e528e..00000000000
--- a/go/master/service.go
+++ /dev/null
@@ -1,510 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"bytes"
-	"compress/gzip"
-	"encoding/gob"
-	"errors"
-	"math/rand"
-	"os"
-	"path/filepath"
-	"sync"
-	"time"
-
-	log "github.com/inconshreveable/log15"
-
-	"github.com/PaddlePaddle/recordio"
-)
-
-const (
-	dialTimeout = 5 * time.Second
-)
-
-// ErrAllTaskFailed occur when tasks are in done or failed state.
-var ErrAllTaskFailed = errors.New("all task finished")
-
-// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail.
-var ErrNoMoreAvailable = errors.New("no more available task")
-
-// ErrPassBefore client side pass number does not match with master counter.
-var ErrPassBefore = errors.New("pass number smaller than master")
-
-// ErrPassAfter client side pass number does not match with master counter.
-var ErrPassAfter = errors.New("pass number larger than master")
-
-// Store is the interface for save and load the master state.
-type Store interface {
-	Save([]byte) error
-	Load() ([]byte, error)
-	Shutdown() error
-}
-
-// Chunk is a chunk of data consisted of several data instances.
-type Chunk struct {
-	Path  string
-	Index recordio.Index // chunk index
-}
-
-// TaskMeta is a struct which stores task's meta info.
-type TaskMeta struct {
-	ID    int
-	Epoch int
-}
-
-// Task is the basic unit of data instances assigned to trainers.
-type Task struct {
-	Meta   TaskMeta
-	Chunks []Chunk
-}
-
-type taskEntry struct {
-	Task Task
-	// A task fails if it's timeout or trainer reports it exits unnormally.
-	NumFailure int
-}
-
-type masterState struct {
-	Todo    []taskEntry
-	Pending map[int]taskEntry // map from task ID to task entry
-	Done    []taskEntry
-	Failed  []taskEntry
-	CurPass int
-}
-
-// Service is the master server service.
-type Service struct {
-	chunksPerTask int
-	timeoutDur    time.Duration
-	failureMax    int
-	store         Store
-
-	ready    chan struct{}
-	initDone bool
-
-	mu sync.Mutex
-	// State to be persisted to snapshot.
-	state masterState
-	// The trainer that is currently saving model. This state is
-	// transient, does not need to be persisted to snapshot.
-	savingTrainer string
-}
-
-func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	// generate uniq id across job using nanosecond + randint + counter
-	// FIXME(typhoonzero): this is a workaround, use uuid
-	randStart := rand.Int()
-	counter := 0
-	timestamp := time.Now().Nanosecond()
-	id := timestamp + randStart + counter
-	if chunksPerTask <= 0 {
-		chunksPerTask = 1
-	}
-
-	var result []taskEntry
-	var cur taskEntry
-	for i, c := range chunks {
-		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.Meta.ID = id
-			counter++
-			id = timestamp + randStart + counter
-			result = append(result, cur)
-			cur.Task.Chunks = nil
-		}
-
-		cur.Task.Chunks = append(cur.Task.Chunks, c)
-	}
-
-	if len(cur.Task.Chunks) > 0 {
-		cur.Task.Meta.ID = id
-		result = append(result, cur)
-	}
-
-	return result
-}
-
-// NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
-	s := &Service{}
-	s.chunksPerTask = chunksPerTask
-	s.timeoutDur = timeoutDur
-	s.failureMax = failureMax
-	s.state = masterState{}
-	s.state.Pending = make(map[int]taskEntry)
-	s.ready = make(chan struct{})
-	s.store = store
-	recovered, err := s.recover()
-	if err != nil {
-		return nil, err
-	}
-
-	if recovered {
-		// Recovered. Now the state is already initialized,
-		// and the master is ready.
-		s.initDone = true
-		close(s.ready)
-		log.Info("Master recovered from saved state.")
-	}
-
-	return s, nil
-}
-
-// recover recovers service state from etcd.
-func (s *Service) recover() (bool, error) {
-	state, err := s.store.Load()
-	if err != nil {
-		return false, err
-	}
-
-	if state == nil {
-		log.Info("No state exists, not recovered.")
-		return false, nil
-	}
-
-	log.Info("Loaded snapshot.", log.Ctx{"size": len(state)})
-	gr, err := gzip.NewReader(bytes.NewReader(state))
-	if err != nil {
-		return false, err
-	}
-
-	dec := gob.NewDecoder(gr)
-	var tqs masterState
-	err = dec.Decode(&tqs)
-	if err != nil {
-		return false, err
-	}
-
-	err = gr.Close()
-	if err != nil {
-		// Only close failed, recover actually succeed, so
-		// just log error.
-		log.Error("error close recover file.", log.Ctx{"error": err})
-	}
-
-	s.state = tqs
-	log.Info("Master recovered from snapshot, scheduling pending task timeout check.", s.logCtx())
-	for _, t := range s.state.Pending {
-		time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
-	}
-
-	return true, nil
-}
-
-// snapshot *must* be called with s.mu being held.
-func (s *Service) snapshot() error {
-	// TODO(helin): etcd request has a size limit, so the snapshot
-	// size is limited by the max request size. We should either
-	// divide the snapshot into smaller chunks and save under
-	// different keys, or configure the request size to be big
-	// enough:
-	// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
-	var buf bytes.Buffer
-	gw := gzip.NewWriter(&buf)
-	enc := gob.NewEncoder(gw)
-	err := enc.Encode(s.state)
-	if err != nil {
-		return err
-	}
-	err = gw.Close()
-	if err != nil {
-		return err
-	}
-
-	state := buf.Bytes()
-	log.Info("Saving snapshot.", log.Ctx{"size bytes": len(state)})
-	return s.store.Save(state)
-}
-
-func readChunks(globPaths []string) ([]Chunk, error) {
-	var chunks []Chunk
-	var paths []string
-
-	for _, s := range globPaths {
-		match, err := filepath.Glob(s)
-		if err != nil {
-			return nil, err
-		}
-		paths = append(paths, match...)
-	}
-
-	if len(paths) == 0 {
-		return nil, errors.New("no valid dataset specified")
-	}
-
-	for _, path := range paths {
-		f, err := os.Open(path)
-		if err != nil {
-			return nil, err
-		}
-
-		index, err := recordio.LoadIndex(f)
-		if err != nil {
-			return nil, err
-		}
-		err = f.Close()
-		if err != nil {
-			return nil, err
-		}
-
-		count := index.NumChunks()
-		log.Info("reading chunks.", log.Ctx{"path": path, "num chunks": count})
-		for i := 0; i < count; i++ {
-			chunk := Chunk{
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
-		}
-	}
-
-	return chunks, nil
-}
-
-// SetDataset sets dataset to dispatch for the master server.
-//
-// SetDataset can be call multiple times. But only the first call will
-// be honored.
-func (s *Service) SetDataset(globPaths []string, _ *int) error {
-	if len(globPaths) == 0 {
-		return errors.New("no dataset specified")
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	if s.initDone {
-		// Already initialized. All trainer will call
-		// SetDataset, but we only handle the first one. Treat
-		// other calls as successful but do nothing.
-		return nil
-	}
-
-	chunks, err := readChunks(globPaths)
-	if err != nil {
-		return err
-	}
-
-	s.state.Todo = partition(chunks, s.chunksPerTask)
-
-	err = s.snapshot()
-	if err != nil {
-		log.Error("snapshot error", log.Ctx{"error": err})
-		return err
-	}
-	close(s.ready)
-	s.initDone = true
-	return nil
-}
-
-// processFailedTask retry s.failureMax times for failed task.
-// return true if all task are done or failed.
-func (s *Service) processFailedTask(t taskEntry, epoch int) {
-	if t.Task.Meta.Epoch != epoch {
-		// new epoch, task launched after the
-		// schedule of this timeout check or failed status report.
-		return
-	}
-
-	defer func() {
-		err := s.snapshot()
-		if err != nil {
-			log.Error("snapshot error", log.Ctx{"error": err})
-		}
-	}()
-
-	delete(s.state.Pending, t.Task.Meta.ID)
-
-	t.NumFailure++
-	if t.NumFailure > s.failureMax {
-		log.Warn("Task failed to many times, discard.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
-		s.state.Failed = append(s.state.Failed, t)
-		return
-	}
-
-	log.Warn("Task failed, re-dispatch.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
-	s.state.Todo = append(s.state.Todo, t)
-	return
-}
-
-func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
-	return func() {
-		s.mu.Lock()
-		defer s.mu.Unlock()
-
-		t, ok := s.state.Pending[taskID]
-		if !ok {
-			return
-		}
-
-		s.processFailedTask(t, epoch)
-	}
-}
-
-// must be called with lock held.
-func (s *Service) logCtx() log.Ctx {
-	return log.Ctx{
-		"todoLen":    len(s.state.Todo),
-		"pendingLen": len(s.state.Pending),
-		"doneLen":    len(s.state.Done),
-		"failedLen":  len(s.state.Failed),
-		"curPass":    s.state.CurPass,
-	}
-}
-
-// GetTask gets a new task from the service.
-// passID is the client side pass count
-func (s *Service) GetTask(passID int, task *Task) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	if passID < s.state.CurPass {
-		return ErrPassBefore
-	}
-	if passID > s.state.CurPass {
-		// Client may get run to pass after master when one client faster than the
-		// other
-		return ErrPassAfter
-	}
-
-	if len(s.state.Todo) == 0 {
-		if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
-			log.Warn("All tasks failed, may start next pass", s.logCtx())
-			return ErrAllTaskFailed
-		}
-		log.Warn("No more available task.", s.logCtx())
-		return ErrNoMoreAvailable
-	}
-
-	t := s.state.Todo[0]
-	t.Task.Meta.Epoch++
-	s.state.Todo = s.state.Todo[1:]
-	s.state.Pending[t.Task.Meta.ID] = t
-	err := s.snapshot()
-	if err != nil {
-		return err
-	}
-
-	*task = t.Task
-	ctx := s.logCtx()
-	ctx["task meta"] = t.Task.Meta
-	log.Info("Task dispatched.", ctx)
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
-	return nil
-}
-
-// TaskFinished tell the service that a task is finished.
-func (s *Service) TaskFinished(taskID int, dummy *int) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	t, ok := s.state.Pending[taskID]
-	if !ok {
-		ctx := s.logCtx()
-		ctx["task id"] = taskID
-		log.Warn("Pending task not found.", ctx)
-		return nil
-	}
-
-	// task finished, reset timeout
-	t.NumFailure = 0
-	s.state.Done = append(s.state.Done, t)
-	delete(s.state.Pending, taskID)
-
-	ctx := s.logCtx()
-	ctx["task id"] = taskID
-	log.Info("Task finished.", ctx)
-	if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
-		// increase master side pass count if all tasks finished
-		s.state.CurPass++
-		s.state.Todo = append(s.state.Done, s.state.Failed...)
-		s.state.Done = []taskEntry{}
-		// TODO(typhoonzero): deal with failed tasks
-		s.state.Failed = []taskEntry{}
-		ctx := s.logCtx()
-		ctx["new pass"] = s.state.CurPass
-		log.Warn("all task finished, add new pass data.", ctx)
-	}
-
-	err := s.snapshot()
-	if err != nil {
-		log.Error("snapshot error", log.Ctx{"error": err})
-	}
-	return err
-}
-
-// TaskFailed tells the service that a task is failed.
-func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	t, ok := s.state.Pending[meta.ID]
-	if !ok {
-		log.Warn("TaskFailed:Pending task not found.", log.Ctx{"task": t.Task.Meta})
-		return nil
-	}
-
-	s.processFailedTask(t, meta.Epoch)
-	return nil
-}
-
-// SaveModelRequest is the request for saving model
-type SaveModelRequest struct {
-	TrainerID string
-	BlockDur  time.Duration
-}
-
-// RequestSaveModel requests the master server to approve the caller
-// to save the model.
-func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	if req.TrainerID == "" {
-		return errors.New("trainer id is empty")
-	}
-
-	if s.savingTrainer == "" {
-		*need = true
-	} else {
-		if req.TrainerID == s.savingTrainer {
-			// save trainer asked to save model again
-			*need = true
-		} else {
-			*need = false
-		}
-	}
-
-	if *need {
-		s.savingTrainer = req.TrainerID
-		time.AfterFunc(req.BlockDur, func() {
-			s.mu.Lock()
-			s.savingTrainer = ""
-			s.mu.Unlock()
-		})
-	}
-
-	return nil
-}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
deleted file mode 100644
index dd22f3d548b..00000000000
--- a/go/master/service_internal_test.go
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import "testing"
-
-func TestPartitionCount(t *testing.T) {
-	cs := make([]Chunk, 100)
-	ts := partition(cs, 5)
-	if len(ts) != 20 {
-		t.Error(len(ts))
-	}
-
-	cs = make([]Chunk, 101)
-	ts = partition(cs, 5)
-	if len(ts) != 21 {
-		t.Error(len(ts))
-	}
-
-	ts = partition(cs, 1)
-	if len(ts) != 101 {
-		t.Error(len(ts))
-	}
-
-	ts = partition(cs, 0)
-	if len(ts) != 101 {
-		t.Error(len(ts))
-	}
-}
-
-func TestPartionIndex(t *testing.T) {
-	cs := make([]Chunk, 100)
-	ts := partition(cs, 20)
-	for i := range ts {
-		// test auto increament ids
-		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
-			t.Error(ts[i], i)
-		}
-	}
-}
diff --git a/go/master/service_test.go b/go/master/service_test.go
deleted file mode 100644
index 2d00c22d6fe..00000000000
--- a/go/master/service_test.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package master_test
-
-import (
-	"io/ioutil"
-	"net/url"
-	"os"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/embed"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewServiceWithEtcd(t *testing.T) {
-	// setup an embed etcd server
-	etcdDir, err := ioutil.TempDir("", "")
-	if err != nil {
-		t.Fatal(err)
-	}
-	cfg := embed.NewConfig()
-	lpurl, _ := url.Parse("http://localhost:0")
-	lcurl, _ := url.Parse("http://localhost:0")
-	cfg.LPUrls = []url.URL{*lpurl}
-	cfg.LCUrls = []url.URL{*lcurl}
-	cfg.Dir = etcdDir
-	e, err := embed.StartEtcd(cfg)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer func() {
-		e.Close()
-		if err := os.RemoveAll(etcdDir); err != nil {
-			t.Fatal(err)
-		}
-	}()
-
-	<-e.Server.ReadyNotify()
-
-	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
-	endpoint := "127.0.0.1:" + port
-
-	ep := []string{endpoint}
-	masterAddr := "127.0.0.1:3306"
-	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = master.NewService(store, 10, 10, 3)
-	if err != nil {
-		t.Fatal(err)
-	}
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   ep,
-		DialTimeout: 3 * time.Second,
-	})
-	if err != nil {
-		t.Fatal(err)
-	}
-	v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := cli.Close(); err != nil {
-		t.Fatal(err)
-	}
-	// test master process registry itself into etcd server.
-	assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.")
-}
diff --git a/go/proto/.gitignore b/go/proto/.gitignore
deleted file mode 100644
index 5e7d2734cfc..00000000000
--- a/go/proto/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
deleted file mode 100644
index 32f3b2baba3..00000000000
--- a/go/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(pserver_test DEPS paddle_go_optimizer gen_proto_go)
-endif()
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
deleted file mode 100644
index 1d6f45a6642..00000000000
--- a/go/pserver/client/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(pserver_client_test DEPS paddle_go_optimizer)
-endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
deleted file mode 100644
index 4bf05c85386..00000000000
--- a/go/pserver/client/c/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
deleted file mode 100644
index 78776219dee..00000000000
--- a/go/pserver/client/c/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
-target_link_libraries(paddle_go_optimizer stdc++ m)
-
-# Copy library to the required place.
-# See: go/pserver/optimizer.go:
-# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
-add_custom_command(TARGET paddle_go_optimizer POST_BUILD
-  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
-  )
-
-go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
-if(WITH_TESTING)
-  # FIXME: this test requires pserver which is not managed by the test
-  # we need some kind of e2e testing machanism.
-  # add_subdirectory(test)
-endif()
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
deleted file mode 100644
index cddc28e46f4..00000000000
--- a/go/pserver/client/c/cclient.go
+++ /dev/null
@@ -1,300 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-/*
-#include <string.h>
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32   = 0,
-  PADDLE_ELEMENT_TYPE_UINT32  = 1,
-  PADDLE_ELEMENT_TYPE_INT64   = 2,
-  PADDLE_ELEMENT_TYPE_UINT64  = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-typedef struct {
-  char*               name;
-  paddle_element_type element_type;
-  unsigned char*      content;
-  int                 content_len;
-} paddle_parameter, paddle_gradient;
-
-typedef int paddle_pserver_client;
-#define PSERVER_ERROR -1
-#define PSERVER_OK 0
-*/
-import "C"
-
-import (
-	"strings"
-	"sync"
-	"unsafe"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	log "github.com/inconshreveable/log15"
-)
-
-func init() {
-	log.Root().SetHandler(
-		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-}
-
-var mu sync.Mutex
-var handleMap = make(map[C.paddle_pserver_client]*client.Client)
-var curHandle C.paddle_pserver_client
-
-func add(c *client.Client) C.paddle_pserver_client {
-	mu.Lock()
-	defer mu.Unlock()
-	cli := curHandle
-	curHandle++
-	handleMap[cli] = c
-	return cli
-}
-
-func get(client C.paddle_pserver_client) *client.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	return handleMap[client]
-}
-
-func remove(client C.paddle_pserver_client) *client.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	h := handleMap[client]
-	delete(handleMap, client)
-	return h
-}
-
-func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nil {
-		return nil
-	}
-
-	// create a Go clice backed by a C array, reference:
-	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	//
-	// Go garbage collector will not interact with this data, need
-	// to be freed properly.
-	return (*[1 << 30]byte)(p)[:len:len]
-}
-
-type selector bool
-
-func (s selector) Select() (bool, error) {
-	return bool(s), nil
-}
-
-func (s selector) Done() error {
-	return nil
-}
-
-type lister []client.Server
-
-func (l lister) List() []client.Server {
-	return l
-}
-
-//export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
-	a := C.GoString(addrs)
-	as := strings.Split(a, ",")
-	servers := make([]client.Server, len(as))
-	for i := range as {
-		servers[i].Index = i
-		servers[i].Addr = as[i]
-	}
-	c := client.NewClient(lister(servers), len(as), selector(selected != 0))
-	return add(c)
-}
-
-//export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
-	addr := C.GoString(etcdEndpoints)
-	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
-	return add(c)
-}
-
-//export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.paddle_pserver_client) {
-	remove(client)
-}
-
-// paddle_begin_init_params tells trainer if it needs to init the
-// parameters.
-//
-// returns 1 if the trainer needs to init the parameters. 0 if the
-// trainer does not need to init the parameters.
-//
-//export paddle_begin_init_params
-func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
-	c := get(client)
-	selected, err := c.BeginInitParams()
-	if err != nil {
-		panic(err)
-	}
-
-	if selected {
-		return 1
-	}
-	return 0
-}
-
-//export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
-	et := pserver.ElementType(param.element_type)
-	name := C.GoString(param.name)
-	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
-	pc := pserver.ParameterWithConfig{
-		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(paramConfig, int(configLen)),
-	}
-	c := get(client)
-	err := c.InitParam(pc)
-
-	if err != nil {
-		if err.Error() == pserver.AlreadyInitialized {
-			log.Warn(
-				"parameter already initialized, treat paddle_init_param as successful.",
-				log.Ctx{"parameter": name},
-			)
-			return C.PSERVER_OK
-		}
-		log.Error("error init param", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_finish_init_params
-func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
-	c := get(client)
-	err := c.FinishInitParams()
-	if err != nil {
-		if err.Error() == pserver.AlreadyInitialized {
-			log.Warn("parameters already initialized, treat paddle_finish_init_params as successful.")
-			return C.PSERVER_OK
-		}
-
-		log.Error("error finish init params", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_send_grads
-func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
-	var gs []pserver.Gradient
-	for i := 0; i < int(total); i++ {
-		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
-		et := pserver.ElementType(grad.element_type)
-		name := C.GoString(grad.name)
-		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
-		gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content})
-	}
-
-	c := get(client)
-	err := c.SendGrads(gs)
-	if err != nil {
-		log.Error("error send grads", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_get_params
-func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
-	var ns []string
-	for i := 0; i < int(total); i++ {
-		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		ns = append(ns, C.GoString(param.name))
-	}
-	c := get(client)
-	ps, err := c.GetParams(ns)
-	if err != nil {
-		log.Error("error get params", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	if len(ps) != len(ns) {
-		pn := make([]string, len(ps))
-		for i, p := range ps {
-			pn[i] = p.Name
-		}
-		log.Error(
-			"pserver returned wrong number of parameters.",
-			log.Ctx{
-				"Requested": strings.Join(pn, ", "),
-				"Returned":  strings.Join(ns, ", "),
-			},
-		)
-		return C.PSERVER_ERROR
-	}
-
-	for i := range ps {
-		if ns[i] != ps[i].Name {
-			pn := make([]string, len(ps))
-			for i, p := range ps {
-				pn[i] = p.Name
-			}
-			log.Error(
-				"pserver returned wrong parameters, or not in requested order.",
-				log.Ctx{
-					"Requested": strings.Join(pn, ", "),
-					"Returned":  strings.Join(ns, ", "),
-				},
-			)
-			return C.PSERVER_ERROR
-		}
-	}
-
-	for i := 0; i < int(total); i++ {
-		p := ps[i]
-		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-
-		if unsafe.Pointer(param) == nil {
-			log.Error("must pre-allocate parameter.")
-			return C.PSERVER_ERROR
-		}
-
-		if unsafe.Pointer(param.content) != nil {
-			if int(param.content_len) != len(p.Content) {
-				log.Error(
-					"the pre-allocated content len does not match parameter content len.",
-					log.Ctx{
-						"Pre-allocated len": param.content_len,
-						"Returned len":      len(p.Content),
-					},
-				)
-				return C.PSERVER_ERROR
-			}
-		}
-
-		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
-		param.content_len = C.int(len(p.Content))
-		param.element_type = C.paddle_element_type(p.ElementType)
-	}
-
-	return C.PSERVER_OK
-}
-
-func main() {} // Required but ignored
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
deleted file mode 100644
index 4500b1f2883..00000000000
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c
deleted file mode 100644
index 0116e42a0a6..00000000000
--- a/go/pserver/client/c/test/test_cclient.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "libpaddle_pserver_cclient.h"
-
-// TODO(helin): Fix: gtest using cmake is not working, using this
-// hacky way for now.
-#define fail()                                          \
-  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
-  exit(-1);
-
-void sendGrads(paddle_pserver_client c) {
-  unsigned char grad_a[2000] = {2};
-  unsigned char grad_b[3000] = {3};
-  paddle_gradient grad1 = {
-      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
-  paddle_gradient grad2 = {
-      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
-  paddle_gradient *grads[2] = {&grad1, &grad2};
-  if (paddle_send_grads(c, grads, 2)) {
-    fail();
-  }
-}
-
-void getParams(paddle_pserver_client c) {
-  paddle_parameter param_a;
-  paddle_parameter param_b;
-  char name_a[] = "param_a";
-  char name_b[] = "param_b";
-  // Must pre-allocate the prameter content before calling paddle_get_params.
-  unsigned char content_a[2000] = {};
-  unsigned char content_b[3000] = {};
-  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-  param_a.name = name_a;
-  param_a.content = content_a;
-  param_a.content_len = 2000;
-  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-  param_b.name = name_b;
-  param_b.content = content_b;
-  param_b.content_len = 3000;
-
-  paddle_parameter *params[2] = {&param_a, &param_b};
-  if (paddle_get_params(c, params, 2)) {
-    fail();
-  }
-}
-
-int main() {
-  char addr[] = "localhost:3000";
-  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
-  char *config_proto;
-  size_t config_proto_len = 0;
-  ssize_t nread;
-  FILE *fp = fopen("testdata/optimizer.pb", "r");
-  if (!fp) {
-    fail();
-  }
-  while ((nread = getline(&config_proto, &config_proto_len, fp)) != -1) {
-    printf("%s", config_proto);
-  }
-  fclose(fp);
-retry:
-  if (paddle_begin_init_params(c)) {
-    paddle_parameter param;
-    char name_a[] = "param_a";
-    char name_b[] = "param_b";
-    unsigned char content_a[2000] = {1};
-    unsigned char content_b[3000] = {0};
-    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-    param.name = name_a;
-    param.content = content_a;
-    param.content_len = 2000;
-    int error =
-        paddle_init_param(c, param, (void *)config_proto, config_proto_len);
-    if (error != 0) {
-      goto retry;
-    }
-
-    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-    param.name = name_b;
-    param.content = content_b;
-    param.content_len = 3000;
-    error = paddle_init_param(c, param, (void *)config_proto, config_proto_len);
-    if (error != 0) {
-      goto retry;
-    }
-
-    error = paddle_finish_init_params(c);
-    if (error != 0) {
-      goto retry;
-    }
-  }
-
-  int i;
-  for (i = 0; i < 100; i++) {
-    sendGrads(c);
-    getParams(c);
-  }
-
-  return 0;
-}
diff --git a/go/pserver/client/c/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py
deleted file mode 100644
index 97f63aeb6d4..00000000000
--- a/go/pserver/client/c/test/test_mnist.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-import gzip
-
-
-def softmax_regression(img):
-    predict = paddle.layer.fc(input=img,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def multilayer_perceptron(img):
-    # The first fully-connected layer
-    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
-    # The second fully-connected layer and the according activation function
-    hidden2 = paddle.layer.fc(input=hidden1,
-                              size=64,
-                              act=paddle.activation.Relu())
-    # The thrid fully-connected layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=hidden2,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def convolutional_neural_network(img):
-    # first conv layer
-    conv_pool_1 = paddle.networks.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        num_channel=1,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # second conv layer
-    conv_pool_2 = paddle.networks.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        num_channel=20,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # The first fully-connected layer
-    fc1 = paddle.layer.fc(input=conv_pool_2,
-                          size=128,
-                          act=paddle.activation.Tanh())
-    # The softmax layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=fc1,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # define network topology
-    images = paddle.layer.data(
-        name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
-        name='label', type=paddle.data_type.integer_value(10))
-
-    # Here we can build the prediction network in different ways. Please
-    # choose one by uncomment corresponding line.
-    predict = softmax_regression(images)
-    #predict = multilayer_perceptron(images)
-    #predict = convolutional_neural_network(images)
-
-    cost = paddle.layer.classification_cost(input=predict, label=label)
-    parameters = paddle.parameters.create(cost)
-
-    optimizer = paddle.optimizer.Momentum(
-        learning_rate=0.1 / 128.0,
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 is_local=False,
-                                 pserver_spec="localhost:3000")
-
-    lists = []
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1000 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-
-        elif isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=128))
-            print "Test with Pass %d, Cost %f, %s\n" % (
-                event.pass_id, result.cost, result.metrics)
-            lists.append((event.pass_id, result.cost,
-                          result.metrics['classification_error_evaluator']))
-
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=128),
-        event_handler=event_handler,
-        num_passes=100)
-
-    # find the best pass
-    best = sorted(lists, key=lambda list: float(list[1]))[0]
-    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
-
-    test_creator = paddle.dataset.mnist.test()
-    test_data = []
-    for item in test_creator():
-        test_data.append((item[0], ))
-        if len(test_data) == 100:
-            break
-
-    # output is a softmax layer. It returns probabilities.
-    # Shape should be (100, 10)
-    probs = paddle.infer(
-        output_layer=predict, parameters=parameters, input=test_data)
-    print probs.shape
-
-
-if __name__ == '__main__':
-    main()
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
deleted file mode 100644
index 2db5a0bf6a5..00000000000
--- a/go/pserver/client/c/test/test_train.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-import paddle.v2.dataset.uci_housing as uci_housing
-import paddle.v2.master as master
-import os
-import cPickle as pickle
-from paddle.v2.reader.creator import cloud_reader
-
-etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
-etcd_endpoints = "http://" + etcd_ip + ":2379"
-print "etcd endpoints: ", etcd_endpoints
-
-
-def main():
-    # init
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # network config
-    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-    y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(name='w'),
-                                size=1,
-                                act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(name='b'))
-    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-    cost = paddle.layer.mse_cost(input=y_predict, label=y)
-
-    # create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # create optimizer of new remote updater to pserver
-    optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3)
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 is_local=False,
-                                 pserver_spec=etcd_endpoints,
-                                 use_etcd=True)
-
-    # event_handler to print training and testing info
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            # FIXME: for cloud data reader, pass number is managed by master
-            # should print the server side pass number
-            if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f" % (
-                    event.pass_id, event.batch_id, event.cost)
-
-        if isinstance(event, paddle.event.EndPass):
-            if (event.pass_id + 1) % 10 == 0:
-                result = trainer.test(
-                    reader=paddle.batch(
-                        uci_housing.test(), batch_size=2),
-                    feeding={'x': 0,
-                             'y': 1})
-                print "Test %d, %.2f" % (event.pass_id, result.cost)
-
-    # training
-    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                cloud_reader(
-                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
-                    etcd_endpoints),
-                buf_size=500),
-            batch_size=2),
-        feeding={'x': 0,
-                 'y': 1},
-        event_handler=event_handler,
-        num_passes=30)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/go/pserver/client/c/test/testdata/optimizer.pb b/go/pserver/client/c/test/testdata/optimizer.pb
deleted file mode 100644
index 27dd3bc5f19e2964b4b674cff8860233cbdb445a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 50
kcmd;JloDUb$N&X9;j9CU3=s@ToSd^}g1}Dum25B;0LStS`2YX_

diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go
deleted file mode 100644
index 2a8f66a07c7..00000000000
--- a/go/pserver/client/client.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client
-
-import (
-	"errors"
-	"hash/fnv"
-	"sort"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/inconshreveable/log15"
-)
-
-// TODO(helin): add RPC call retry logic
-
-// Selector selects if the client should initialize parameters and
-// reports the initialization process done.
-type Selector interface {
-	// Select selects if the client should initialize parameter servers.
-	Select() (bool, error)
-	// Done indicates the initialization process is done.
-	Done() error
-}
-
-// Server is the identification of a parameter Server.
-type Server struct {
-	Index int
-	Addr  string
-}
-
-// Lister lists currently available parameter servers.
-type Lister interface {
-	List() []Server
-}
-
-// Client is the client to parameter servers.
-type Client struct {
-	sel      Selector
-	pservers []*connection.Conn
-}
-
-// NewClient creates a new client.
-func NewClient(l Lister, pserverNum int, sel Selector) *Client {
-	c := &Client{sel: sel}
-	c.pservers = make([]*connection.Conn, pserverNum)
-	for i := 0; i < pserverNum; i++ {
-		c.pservers[i] = connection.New()
-	}
-	go c.monitorPservers(l, pserverNum)
-	return c
-}
-
-// monitorPservers monitors pserver addresses, and updates connection
-// when the address changes.
-func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	lastServers := make([]Server, pserverNum)
-	ticker := time.NewTicker(10 * time.Second)
-	monitor := func() {
-		curServers := make([]Server, pserverNum)
-		list := l.List()
-		for _, l := range list {
-			curServers[l.Index] = l
-		}
-
-		for i := range lastServers {
-			if lastServers[i].Addr == curServers[i].Addr {
-				continue
-			}
-
-			if curServers[i].Addr == "" {
-				err := c.pservers[i].Close()
-				if err != nil {
-					log.Error("error closing connection to pserver", log.Ctx{"error": err})
-				}
-
-				continue
-			}
-
-			err := c.pservers[i].Connect(curServers[i].Addr)
-			if err != nil {
-				log.Error("error connecting to pserver", log.Ctx{"error": err})
-
-				// connect to addr failed, set
-				// to last known addr in order
-				// to retry next time.
-				curServers[i].Addr = lastServers[i].Addr
-			}
-
-		}
-
-		lastServers = curServers
-	}
-
-	monitor()
-	for range ticker.C {
-		monitor()
-	}
-}
-
-// BeginInitParams begins to initialize parameters on parameter
-// servers.
-//
-// BeginInitParams will be called from multiple trainers, only one
-// trainer will be selected to initialize the parameters on parameter
-// servers. Other trainers will be blocked until the initialization is
-// done, and they need to get the initialized parameters from
-// parameter servers using GetParams.
-func (c *Client) BeginInitParams() (bool, error) {
-	return c.sel.Select()
-}
-
-// InitParam initializes the parameter on parameter servers.
-func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error {
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
-}
-
-// FinishInitParams tells parameter servers client has sent all
-// parameters to parameter servers as initialization.
-func (c *Client) FinishInitParams() error {
-	for _, p := range c.pservers {
-		err := p.Call("Service.FinishInitParams", 0, nil)
-		if err != nil {
-			return err
-		}
-	}
-	return c.sel.Done()
-}
-
-// SendGrads sends gradients to parameter servers for updating
-// parameters.
-func (c *Client) SendGrads(grads []pserver.Gradient) error {
-	if len(grads) == 0 {
-		return errors.New("no gradient received")
-	}
-	errCh := make(chan error, len(grads))
-	for _, g := range grads {
-		go func(g pserver.Gradient) {
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
-			errCh <- err
-		}(g)
-	}
-
-	recv := 0
-	for err := range errCh {
-		if err != nil {
-			return err
-		}
-
-		recv++
-		if recv == len(grads) {
-			break
-		}
-	}
-	return nil
-}
-
-type result struct {
-	idx   int
-	param pserver.Parameter
-	err   error
-}
-
-type results []result
-
-func (r results) Len() int {
-	return len(r)
-}
-
-func (r results) Less(i int, j int) bool {
-	return r[i].idx < r[j].idx
-}
-
-func (r results) Swap(i int, j int) {
-	r[i], r[j] = r[j], r[i]
-}
-
-// GetParams gets parameters from parameter servers.
-func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
-	rCh := make(chan result, len(names))
-
-	for idx, name := range names {
-		go func(name string, idx int) {
-			var parameter pserver.Parameter
-			err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
-			rCh <- result{idx: idx, param: parameter, err: err}
-		}(name, idx)
-	}
-
-	var rs results
-	recv := 0
-	for r := range rCh {
-		if r.err != nil {
-			return nil, r.err
-		}
-		rs = append(rs, r)
-
-		recv++
-		if recv == len(names) {
-			break
-		}
-	}
-	sort.Sort(rs)
-
-	ps := make([]pserver.Parameter, len(rs))
-	for i := range rs {
-		ps[i] = rs[i].param
-	}
-
-	return ps, nil
-}
-
-func strHash(s string) uint32 {
-	h := fnv.New32a()
-	_, _ = h.Write([]byte(s))
-	return h.Sum32()
-}
-
-// TODO(helin): now partition only select which parameter server to
-// send the entire parameter. We need to partition a parameter into
-// small blocks and send to different parameter servers.
-func (c *Client) partition(key string) int {
-	return int(strHash(key) % uint32(len(c.pservers)))
-}
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
deleted file mode 100644
index 3a067ff5188..00000000000
--- a/go/pserver/client/client_test.go
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client_test
-
-import (
-	"context"
-	"io/ioutil"
-	"math/rand"
-	"net"
-	"net/http"
-	"net/rpc"
-	"strconv"
-	"strings"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	"github.com/coreos/etcd/clientv3"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	numPserver    = 10
-	etcdEndpoints = "127.0.0.1:2379"
-	timeout       = 2 * time.Second
-)
-
-var pserverClientPorts [numPserver]int
-
-// this function init pserver client and return their ports in an array.
-func initClient() [numPserver]int {
-	var ports [numPserver]int
-	for i := 0; i < numPserver; i++ {
-		l, err := net.Listen("tcp", ":0")
-		if err != nil {
-			panic(err)
-		}
-
-		ss := strings.Split(l.Addr().String(), ":")
-		p, err := strconv.Atoi(ss[len(ss)-1])
-		if err != nil {
-			panic(err)
-		}
-		ports[i] = p
-
-		go func(l net.Listener) {
-			var cp pserver.Checkpoint
-			s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-			if err != nil {
-				panic(err)
-			}
-			server := rpc.NewServer()
-			err = server.Register(s)
-			if err != nil {
-				panic(err)
-			}
-
-			mux := http.NewServeMux()
-			mux.Handle(rpc.DefaultRPCPath, server)
-			err = http.Serve(l, mux)
-			if err != nil {
-				panic(err)
-			}
-		}(l)
-	}
-	return ports
-}
-
-func initNativeClient() {
-	pserverClientPorts = initClient()
-}
-
-func initEtcdClient() {
-	client, err := clientv3.New(clientv3.Config{
-		Endpoints:   []string{etcdEndpoints},
-		DialTimeout: time.Second * time.Duration(1),
-	})
-	if err != nil {
-		log.Error("error init etcd client", log.Ctx{"error": err})
-	}
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	_, err = client.Delete(ctx, pserver.PsDesired)
-	if err != nil {
-		panic(err)
-	}
-
-	_, err = client.Delete(ctx, pserver.PsPath)
-	if err != nil {
-		panic(err)
-	}
-
-	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
-	if err != nil {
-		panic(err)
-	}
-
-	ports := initClient()
-	for i := 0; i < numPserver; i++ {
-		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
-		if err != nil {
-			panic(err)
-		}
-	}
-	cancel()
-	err = client.Close()
-	if err != nil {
-		panic(err)
-	}
-}
-
-type selector bool
-
-func (s selector) Select() (bool, error) {
-	return bool(s), nil
-}
-
-func (s selector) Done() error {
-	return nil
-}
-
-type lister []client.Server
-
-func (l lister) List() []client.Server {
-	return l
-}
-
-func testClient(t *testing.T, c *client.Client) {
-	selected, err := c.BeginInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if !selected {
-		t.Fatal("should be selected.")
-	}
-
-	const numParameter = 1000
-	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-
-	var wg sync.WaitGroup
-	for i := 0; i < numParameter; i++ {
-		wg.Add(1)
-		go func(i int) {
-			var p pserver.Parameter
-			p.Name = "p_" + strconv.Itoa(i)
-			p.ElementType = pserver.Float32
-			p.Content = make([]byte, (i+1)*100)
-			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
-			if err != nil {
-				t.Fatal(err)
-			}
-			wg.Done()
-		}(i)
-	}
-	wg.Wait()
-
-	err = c.FinishInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var grads []pserver.Gradient
-	for i := 0; i < numParameter; i++ {
-		var g pserver.Gradient
-		g.Name = "p_" + strconv.Itoa(i)
-		g.ElementType = pserver.Float32
-		g.Content = make([]byte, (i+1)*100)
-		grads = append(grads, g)
-	}
-
-	const paramPerGroup = 10
-	const numGroups = numParameter / paramPerGroup
-
-	// shuffle send grads order
-	for i := range grads {
-		j := rand.Intn(i + 1)
-		grads[i], grads[j] = grads[j], grads[i]
-	}
-
-	for i := 0; i < numGroups; i++ {
-		var gs []pserver.Gradient
-		if i == numGroups-1 {
-			gs = grads[i*paramPerGroup:]
-		} else {
-			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
-		}
-
-		wg.Add(1)
-		go func(gs []pserver.Gradient) {
-			err := c.SendGrads(gs)
-			if err != nil {
-				t.Fatal(err)
-			}
-			wg.Done()
-		}(gs)
-	}
-
-	names := make([]string, numParameter)
-	for i := 0; i < numParameter; i++ {
-		names[i] = "p_" + strconv.Itoa(i)
-	}
-
-	for i := 0; i < numGroups; i++ {
-		var ns []string
-		if i == numGroups-1 {
-			ns = names[i*paramPerGroup:]
-		} else {
-			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
-		}
-
-		wg.Add(1)
-		go func(ns []string) {
-			params, err := c.GetParams(ns)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			if len(ns) != len(params) {
-				t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-			}
-
-			for i := range params {
-				if ns[i] != params[i].Name {
-					t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name)
-				}
-			}
-			wg.Done()
-		}(ns)
-	}
-
-	wg.Wait()
-}
-
-func TestNativeClient(t *testing.T) {
-	initNativeClient()
-	servers := make([]client.Server, numPserver)
-	for i := 0; i < numPserver; i++ {
-		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
-	}
-	c1 := client.NewClient(lister(servers), len(servers), selector(true))
-	testClient(t, c1)
-}
-
-// EtcdClient is a disabled test, since we have not embedded etcd into
-// our test.
-func EtcdClient(t *testing.T) {
-	initEtcdClient()
-	etcdClient := client.NewEtcd(etcdEndpoints)
-	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
-	testClient(t, c2)
-}
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
deleted file mode 100644
index 3fb835a6e16..00000000000
--- a/go/pserver/client/etcd_client.go
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	defaultEtcdTimeout time.Duration = 5 * time.Second
-
-	initLockPath = "/init_ps/lock"
-	initDonePath = "/init_ps/done"
-	initDoneVal  = "1"
-)
-
-// Etcd is used by pserver client that is a part of trainer process.
-// TODO:
-// 1. add watcher to watch the change state of pservers.
-type Etcd struct {
-	client    *clientv3.Client
-	timeout   time.Duration
-	endpoints []string
-	lock      *concurrency.Mutex
-}
-
-// Desired read ps desired number from etcd.
-func (e *Etcd) Desired() int {
-	var psDesired int
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-		resp, err := e.client.Get(ctx, pserver.PsDesired)
-		cancel()
-		if err != nil {
-			log.Error(
-				"Get ps dresire number failed! reconnecting...",
-				log.Ctx{"error": err},
-			)
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		kvs := resp.Kvs
-		if len(kvs) == 0 {
-			log.Info("Waiting for ps desired registered ...")
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
-		if err != nil {
-			log.Error("atoi failed", log.Ctx{"error": err})
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
-		break
-	}
-	return psDesired
-}
-
-// List return the pserver list read from etcd.
-func (e *Etcd) List() []Server {
-	psDesired := e.Desired()
-
-	servers := make([]Server, psDesired)
-	for {
-		for i := 0; i < psDesired; i++ {
-			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-			psKey := pserver.PsPath + strconv.Itoa(i)
-			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
-			resp, err := e.client.Get(ctx, psKey)
-			cancel()
-			if err != nil {
-				log.Info(
-					"Get psKey error",
-					log.Ctx{"ps key": psKey, "error": err},
-				)
-				time.Sleep(e.timeout)
-				continue
-			}
-			kvs := resp.Kvs
-			if len(kvs) == 0 {
-				log.Info("Waiting for ps addr registered ...")
-				time.Sleep(e.timeout)
-				continue
-			}
-
-			psAddr := string(resp.Kvs[0].Value)
-			// TODO(Longfei) check the ps address
-			if psAddr == "" {
-				log.Info(
-					"Value under psKey is empty",
-					log.Ctx{"psKey": psKey},
-				)
-				time.Sleep(e.timeout)
-				continue
-			}
-			log.Debug(
-				"got psAddr given psKey",
-				log.Ctx{"psAddr": psAddr, "psKey": psKey},
-			)
-			servers[i].Index = i
-			servers[i].Addr = psAddr
-		}
-		break
-	}
-	return servers
-}
-
-// NewEtcd create a etcd client to return the state of pserver on etcd.
-func NewEtcd(endpoints string) *Etcd {
-	ep := strings.Split(endpoints, ",")
-	var cli *clientv3.Client
-	var err error
-	for {
-		cli, err = clientv3.New(clientv3.Config{
-			Endpoints:   ep,
-			DialTimeout: defaultEtcdTimeout,
-		})
-		if err != nil {
-			log.Error("Init etcd connection failed", log.Ctx{"error": err})
-			time.Sleep(defaultEtcdTimeout)
-			continue
-		}
-		break
-	}
-	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
-	client := &Etcd{
-		client:    cli,
-		timeout:   defaultEtcdTimeout,
-		endpoints: ep,
-	}
-	return client
-}
-
-// Select indicates if the current trainer is selected to initialize
-// the pserver parameters.
-func (e *Etcd) Select() (bool, error) {
-	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
-	if err != nil {
-		return false, err
-	}
-
-	lock := concurrency.NewMutex(sess, initLockPath)
-	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
-	// Do not use timeout context here, since we don't know how
-	// long does it take for other trainers to initialize the
-	// parameters.
-	err = lock.Lock(context.Background())
-	if err != nil {
-		return false, err
-	}
-	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
-
-	get := clientv3.OpGet(initDonePath)
-	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
-	cancel()
-	if err != nil {
-		return false, err
-	}
-
-	if !tresp.Succeeded {
-		return false, errors.New("no longer the owner of the lock")
-	}
-
-	resp := tresp.Responses[0].GetResponseRange()
-
-	if len(resp.Kvs) == 0 {
-		// Key value not set, select current trainer.
-		e.lock = lock
-		log.Info("Trainer selected.")
-		return true, nil
-	}
-
-	if string(resp.Kvs[0].Value) == initDoneVal {
-		log.Info("Initialization is already done.")
-		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
-		err = lock.Unlock(ctx)
-		cancel()
-		if err != nil {
-			log.Error("error unlocking", log.Ctx{"error": err})
-		}
-		return false, nil
-	}
-
-	return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
-}
-
-// Done indicates the parameter initialization process is done.
-func (e *Etcd) Done() error {
-	if e.lock == nil {
-		return errors.New("lock is nil, Done called unexpectedly")
-	}
-
-	put := clientv3.OpPut(initDonePath, initDoneVal)
-	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
-	cancel()
-	if err != nil {
-		return err
-	}
-
-	if !tresp.Succeeded {
-		return errors.New("no longer the owner of the lock")
-	}
-
-	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
-	err = e.lock.Unlock(ctx)
-	cancel()
-	if err != nil {
-		log.Error("error unlocking", log.Ctx{"error": err})
-	} else {
-		e.lock = nil
-	}
-
-	return nil
-}
-
-// Close closes the etcd client.
-func (e *Etcd) Close() error {
-	var err error
-	if e.lock != nil {
-		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-		err = e.lock.Unlock(ctx)
-		cancel()
-		if err == nil {
-			e.lock = nil
-		}
-	}
-
-	cErr := e.client.Close()
-	if cErr != nil {
-		if err != nil {
-			log.Error("error closing etcd client", log.Ctx{"error": cErr})
-			return err
-		}
-		return cErr
-	}
-
-	return err
-}
diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go
deleted file mode 100644
index 08742433e7a..00000000000
--- a/go/pserver/client/etcd_client_test.go
+++ /dev/null
@@ -1,106 +0,0 @@
-package client_test
-
-import (
-	"io/ioutil"
-	"net/url"
-	"os"
-	"strings"
-	"sync"
-	"testing"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	"github.com/coreos/etcd/embed"
-)
-
-func TestSelector(t *testing.T) {
-	etcdDir, err := ioutil.TempDir("", "")
-	if err != nil {
-		t.Fatal(err)
-	}
-	cfg := embed.NewConfig()
-	lpurl, _ := url.Parse("http://localhost:0")
-	lcurl, _ := url.Parse("http://localhost:0")
-	cfg.LPUrls = []url.URL{*lpurl}
-	cfg.LCUrls = []url.URL{*lcurl}
-	cfg.Dir = etcdDir
-	e, err := embed.StartEtcd(cfg)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	defer func() {
-		e.Close()
-		if err := os.RemoveAll(etcdDir); err != nil {
-			t.Fatal(err)
-		}
-	}()
-
-	<-e.Server.ReadyNotify()
-
-	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
-	endpoint := "127.0.0.1:" + port
-
-	var mu sync.Mutex
-	selectedCount := 0
-	var wg sync.WaitGroup
-	selectAndDone := func(c *client.Etcd) {
-		defer wg.Done()
-
-		selected, err := c.Select()
-		if err != nil {
-			panic(err)
-		}
-
-		if selected {
-			mu.Lock()
-			selectedCount++
-			mu.Unlock()
-			err = c.Done()
-			if err != nil {
-				t.Fatal(err)
-			}
-		}
-	}
-
-	c0 := client.NewEtcd(endpoint)
-	c1 := client.NewEtcd(endpoint)
-	c2 := client.NewEtcd(endpoint)
-	c3 := client.NewEtcd(endpoint)
-	wg.Add(3)
-	go selectAndDone(c0)
-	go selectAndDone(c1)
-	go selectAndDone(c2)
-	wg.Wait()
-
-	// simulate trainer crashed and restarted after the
-	// initialization process.
-	wg.Add(1)
-	go selectAndDone(c3)
-	wg.Wait()
-
-	mu.Lock()
-	if selectedCount != 1 {
-		t.Fatal("selected count wrong:", selectedCount)
-	}
-	mu.Unlock()
-
-	err = c0.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c1.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c2.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c3.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
deleted file mode 100644
index 719013b1bb4..00000000000
--- a/go/pserver/etcd_client.go
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"context"
-	"errors"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	// PsDesired is etcd path for store desired pserver count
-	PsDesired = "/ps_desired"
-	// PsPath is the base dir for pserver to store their addr
-	PsPath = "/ps/"
-	// PsCheckpoint is the etcd path for store checkpoints information
-	PsCheckpoint = "/checkpoints/"
-
-	retryTimeout = 5 * time.Second
-)
-
-// EtcdClient is the etcd client that the pserver uses for fault
-// tolerance, service registry and coordination.
-type EtcdClient struct {
-	numPservers int
-	endpoints   string
-	client      *clientv3.Client
-	sess        *concurrency.Session
-	dialTimeout time.Duration
-	ttlSec      int
-	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
-	externalIP string
-	// desired number of pservers in the job.
-	// assume desired will not change during one training job.
-	desired int
-}
-
-// NewEtcdClient creates an EtcdClient
-func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient {
-	return &EtcdClient{
-		dialTimeout: dialtimeout,
-		ttlSec:      ttlSec,
-		numPservers: numPservers,
-		endpoints:   endpoints,
-	}
-}
-
-// Register registers the pserver on etcd
-//
-// Register returns the index of the current pserver.
-func (e *EtcdClient) Register(port int) (int, error) {
-	var err error
-	e.externalIP, err = networkhelper.GetExternalIP()
-	if err != nil {
-		return 0, err
-	}
-
-	// initialize connection to etcd.
-	ep := strings.Split(e.endpoints, ",")
-	for {
-		cli, err := clientv3.New(clientv3.Config{
-			Endpoints:   ep,
-			DialTimeout: e.dialTimeout,
-		})
-		if err != nil {
-			log.Error("connect to etcd error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		e.client = cli
-		sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
-		if err != nil {
-			log.Error("create etcd session error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		e.sess = sess
-		log.Debug("connected to etcd", log.Ctx{"endpoint": e.endpoints})
-		break
-	}
-	// init /ps_desired using transaction, for multiple pservers may want to write
-	// it at the same time.
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		_, err := e.initDesiredPservers(ctx, e.numPservers)
-		cancel()
-		if err != nil {
-			log.Warn("pserver init error", log.Ctx{"error": err, "num pservers": e.numPservers})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		break
-	}
-	// TODO: when implementing extending or reducing pservers, /ps_desired is
-	// changed, then we need to watch /ps_desired node for events. For now, just
-	// write once when init and read from it.
-	// wait and set s.desired init value
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		resp, err := e.client.Get(ctx, PsDesired)
-		cancel()
-		if err != nil {
-			log.Error("get etcd key error", log.Ctx{"key": PsDesired, "error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		if len(resp.Kvs) != 0 {
-			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
-			if err != nil {
-				log.Error(
-					"psDesired atoi error",
-					log.Ctx{"error": err, "value": string(resp.Kvs[0].Value)},
-				)
-				time.Sleep(retryTimeout)
-				// NOTE: wait util ps_desired value change
-				continue
-			}
-			break
-		}
-	}
-
-	var pserverIdx int
-	// try register pserver node on etcd
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx, port)
-		cancel()
-		if err != nil {
-			log.Warn("register pserver on etcd error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		break
-	}
-
-	return pserverIdx, nil
-}
-
-func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
-	return concurrency.NewSTM(e.client, func(c concurrency.STM) error {
-		dsStr := c.Get(PsDesired)
-		if dsStr == "" {
-			c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease()))
-		}
-		return nil
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-}
-
-// registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
-	var idx int
-	_, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error {
-		registered := false
-		for i := 0; i < e.desired; i++ {
-			psKey := PsPath + strconv.Itoa(i)
-			ps := c.Get(psKey)
-			log.Debug(
-				"register pserver got value",
-				log.Ctx{"value": ps, "key": psKey},
-			)
-
-			if ps == "" {
-				// find the first id and write info
-				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
-				c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
-				log.Debug("register finished", log.Ctx{"key": psKey, "value": pserverAddr})
-				idx = i
-				registered = true
-				break
-			}
-		}
-		if registered {
-			return nil
-		}
-		return errors.New("not registered, may due to already have enough pservers")
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-
-	if err != nil {
-		return 0, err
-	}
-
-	return idx, nil
-}
-
-// GetKey gets the value by the specified key
-func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	resp, err := e.client.Get(ctx, key)
-	cancel()
-	if err != nil {
-		return []byte{}, err
-	}
-
-	kvs := resp.Kvs
-	if len(kvs) == 0 {
-		return []byte{}, nil
-	}
-	v := kvs[0].Value
-	return v, nil
-}
-
-// PutKey put into etcd with value by key specified
-func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	var err error
-	if withLease {
-		_, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
-	} else {
-		_, err = e.client.Put(ctx, key, string(value))
-	}
-	cancel()
-	return err
-}
-
-// Shutdown shuts down the etcd client gracefully.
-func (e *EtcdClient) Shutdown() error {
-	var err error
-	if e.sess != nil {
-		err = e.sess.Close()
-	}
-
-	if e.client != nil {
-		newErr := e.client.Close()
-		if newErr != nil {
-			if err != nil {
-				log.Error("shutdown error", log.Ctx{"error": newErr})
-			} else {
-				err = newErr
-			}
-		}
-	}
-	return err
-}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
deleted file mode 100644
index eba0c47e195..00000000000
--- a/go/pserver/optimizer.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-// #cgo CFLAGS: -I ../../
-// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
-// #include "paddle/legacy/optimizer/optimizer.h"
-// #include <stdlib.h>
-// #include <string.h>
-import "C"
-
-import (
-	"fmt"
-	"unsafe"
-
-	log "github.com/inconshreveable/log15"
-)
-
-type optimizer struct {
-	opt         *C.struct_paddle_optimizer
-	elementType ElementType
-	contentLen  int
-	config      []byte
-}
-
-func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nil {
-		return nil
-	}
-
-	// create a Go clice backed by a C array, reference:
-	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	//
-	// Go garbage collector will not interact with this data, need
-	// to be freed properly.
-	return (*[1 << 30]byte)(p)[:len:len]
-}
-
-func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
-	o := &optimizer{}
-	o.elementType = paramWithConfigs.Param.ElementType
-	o.contentLen = len(paramWithConfigs.Param.Content)
-	p := paramWithConfigs.Param
-	c := paramWithConfigs.Config
-	s := State
-	paramBufferSize := C.size_t(len(p.Content))
-	log.Info("New Optimizer Created with config", log.Ctx{
-		"ElementType": p.ElementType,
-		"ParamSize":   paramBufferSize,
-		"ConfigSize":  len(c),
-		"StateSize":   len(s),
-	})
-	var cbuffer unsafe.Pointer
-	cbuffer = C.malloc(paramBufferSize)
-
-	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
-	var cstate unsafe.Pointer
-	if len(s) != 0 {
-		cstate = unsafe.Pointer(&s[0])
-	}
-
-	var cptr (*C.uchar)
-	if len(c) > 0 {
-		cptr = (*C.uchar)(&c[0])
-	} else {
-		log.Error("empty config", "param name", paramWithConfigs.Param.Name)
-	}
-	o.config = c
-	o.opt = C.paddle_create_optimizer(
-		cptr,
-		C.int(len(c)),
-		C.paddle_element_type(p.ElementType),
-		cbuffer,
-		C.int(paramBufferSize),
-		(*C.char)(cstate),
-		C.int(len(s)),
-	)
-	return o
-}
-
-func (o *optimizer) GetWeights() []byte {
-	var buffer unsafe.Pointer
-	// we do not own the buffer, no need to free later.
-	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
-	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
-}
-
-func (o *optimizer) GetStates() []byte {
-	var cbuffer *C.char
-	// we owns the state buffer, need to free later.
-	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
-	buf := cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
-	cpy := make([]byte, len(buf))
-	copy(cpy, buf)
-	C.free(unsafe.Pointer(cbuffer))
-	return cpy
-}
-
-func (o *optimizer) UpdateParameter(g Gradient) error {
-	if o.elementType != g.ElementType {
-		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
-	}
-
-	if o.contentLen != len(g.Content) {
-		return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
-	}
-
-	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
-	if r != 0 {
-		return fmt.Errorf("optimizer update returned error code: %d", r)
-	}
-	return nil
-}
-
-func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nil {
-		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nil)
-	}
-}
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
deleted file mode 100644
index 3b923879d5e..00000000000
--- a/go/pserver/optimizer_test.go
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"encoding/binary"
-	"io/ioutil"
-	"math"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestOptimizerCreateRelease(t *testing.T) {
-	p := Parameter{
-		Name:        "a",
-		ElementType: Int32,
-	}
-	p.Content = []byte{1, 3}
-	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	param := ParameterWithConfig{
-		Param:  p,
-		Config: config,
-	}
-	o := newOptimizer(param, nil)
-	o.Cleanup()
-}
-
-func float32Bytes(float float32) []byte {
-	bits := math.Float32bits(float)
-	bytes := make([]byte, 4)
-	binary.LittleEndian.PutUint32(bytes, bits)
-	return bytes
-}
-
-func TestOptimizerState(t *testing.T) {
-	p := Parameter{
-		Name:        "a",
-		ElementType: Int32,
-	}
-	weights := float32Bytes(100)
-	p.Content = weights
-	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	param := ParameterWithConfig{
-		Param:  p,
-		Config: config,
-	}
-	o := newOptimizer(param, nil)
-	s := o.GetStates()
-
-	// clear param content and check if the state is restored.
-	param.Param.Content = float32Bytes(300)
-	o1 := newOptimizer(param, s)
-	s1 := o1.GetStates()
-	assert.Equal(t, s, s1)
-	assert.Equal(t, weights, o.GetWeights())
-	assert.Equal(t, weights, o1.GetWeights())
-	o.Cleanup()
-	o1.Cleanup()
-}
diff --git a/go/pserver/service.go b/go/pserver/service.go
deleted file mode 100644
index d6ead774af5..00000000000
--- a/go/pserver/service.go
+++ /dev/null
@@ -1,450 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"bufio"
-	"bytes"
-	"encoding/binary"
-	"encoding/gob"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"hash/crc32"
-	"io/ioutil"
-	"os"
-	"path"
-	"strconv"
-	"strings"
-	"sync"
-	"time"
-
-	"github.com/golang/protobuf/proto"
-	uuid "github.com/satori/go.uuid"
-
-	pb "github.com/PaddlePaddle/Paddle/go/proto"
-
-	log "github.com/inconshreveable/log15"
-)
-
-// ElementType is the type of elements of a Parameter.
-type ElementType int
-
-// ErrCheckpointNotFound indicates that the pserver checkpoint could
-// not be found.
-var ErrCheckpointNotFound = errors.New("checkpoint not found in etcd")
-
-// RPC error message.
-const (
-	AlreadyInitialized = "pserver already initialized"
-	Uninitialized      = "pserver not fully initialized"
-	WrongChecksum      = "checkpoint file checksum validation failed"
-)
-
-// Supported element types.
-const (
-	Int32 ElementType = iota
-	UInt32
-	Int64
-	UInt64
-	Float32
-	Float64
-)
-
-// Parameter is a piece of data to sync with the parameter server.
-type Parameter struct {
-	Name        string
-	ElementType ElementType
-	Content     []byte
-}
-
-func float32ToString(b []byte) string {
-	f := make([]float32, len(b)/4)
-	buf := bytes.NewReader(b)
-	err := binary.Read(buf, binary.LittleEndian, &f)
-	if err != nil {
-		return ""
-	}
-	return fmt.Sprintf("%v", f)
-}
-
-func float32ByteToString(c []byte) string {
-	var a []byte
-	var b []byte
-	if len(c) <= 80 {
-		a = c
-	} else {
-		a = c[0:40]
-		b = c[len(c)-40:]
-	}
-
-	var s string
-	s = float32ToString(a)
-
-	if b == nil {
-		return s
-	}
-
-	s = strings.Replace(s, "]", "", -1) + "..." + strings.Replace(float32ToString(b), "[", "", -1)
-	return s
-}
-
-func (p Parameter) String() string {
-	if p.ElementType != Float32 {
-		return fmt.Sprintf("name:%v ElementType:%v",
-			p.Name, p.ElementType)
-	}
-
-	return float32ByteToString(p.Content)
-}
-
-// ParameterWithConfig contains the parameter and the configuration.
-type ParameterWithConfig struct {
-	Param  Parameter
-	Config []byte // parameter configuration in Proto Buffer format
-}
-
-// checkpointMeta saves checkpoint metadata
-type checkpointMeta struct {
-	UUID      string `json:"uuid"`
-	Path      string `json:"path"`
-	CRC32     uint32 `json:"crc32"`
-	Timestamp int64  `json:"timestamp"`
-}
-
-// Checkpoint is the pserver shard persist in file.
-type Checkpoint []parameterCheckpoint
-
-// Gradient is the gradient of the parameter.
-type Gradient Parameter
-
-// Service is the RPC service for pserver.
-type Service struct {
-	initialized        chan struct{}
-	idx                int
-	checkpointInterval time.Duration
-	checkpointPath     string
-	client             KVStore
-
-	mu     sync.Mutex
-	optMap map[string]*optimizer
-}
-
-// parameterCheckpoint saves parameter checkpoint.
-type parameterCheckpoint struct {
-	ParameterWithConfig
-	State []byte
-}
-
-type KVStore interface {
-	GetKey(key string, timeout time.Duration) ([]byte, error)
-	PutKey(key string, value []byte, timeout time.Duration, withLease bool) error
-}
-
-func loadMeta(e KVStore, idx int) (meta checkpointMeta, err error) {
-	v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
-	if err != nil {
-		return
-	}
-
-	if len(v) == 0 {
-		err = ErrCheckpointNotFound
-		return
-	}
-
-	if err = json.Unmarshal(v, &meta); err != nil {
-		return
-	}
-
-	return
-}
-
-// LoadCheckpoint loads checkpoint from file.
-func LoadCheckpoint(e KVStore, idx int) (Checkpoint, error) {
-	log.Info("Loading checkpoint", "pserver index", idx)
-	defer traceTime(time.Now(), "load checkpoint")
-
-	cpMeta, err := loadMeta(e, idx)
-	if err != nil {
-		return nil, err
-	}
-
-	content, err := ioutil.ReadFile(cpMeta.Path)
-	if err != nil {
-		return nil, err
-	}
-
-	crc32 := crc32.ChecksumIEEE(content)
-	if crc32 != cpMeta.CRC32 {
-		return nil, errors.New(WrongChecksum)
-	}
-
-	dec := gob.NewDecoder(bytes.NewReader(content))
-	var cp Checkpoint
-	if err = dec.Decode(&cp); err != nil {
-		return nil, err
-	}
-
-	return cp, nil
-}
-
-// NewService creates a new service, will bypass etcd registration if no
-// endpoints specified. It will recovery from checkpoint file if a exists a specified checkpoint.
-func NewService(idx int, interval time.Duration, path string, client KVStore, cp Checkpoint) (*Service, error) {
-	s := &Service{
-		idx:                idx,
-		checkpointInterval: interval,
-		checkpointPath:     path,
-		client:             client,
-	}
-	s.optMap = make(map[string]*optimizer)
-	s.initialized = make(chan struct{})
-
-	if cp != nil {
-		for _, item := range cp {
-			p := ParameterWithConfig{
-				Param:  item.Param,
-				Config: item.Config,
-			}
-			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
-		}
-		close(s.initialized)
-	}
-	return s, nil
-}
-
-// InitParam initializes a parameter.
-func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
-	select {
-	case <-s.initialized:
-		log.Warn("init param called but parameters already initialized.")
-		return errors.New(AlreadyInitialized)
-	default:
-	}
-
-	c := &pb.OptimizerConfig{}
-	proto.Unmarshal(paramWithConfigs.Config, c)
-	log.Debug(fmt.Sprintf("OptimizerConfig:%v", c))
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	// TODO(helin): check if paramWithConfigs.Param.Content is
-	// properly memory aligned, if not, make copy to a memory
-	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
-	log.Info(
-		"init parameter",
-		"name", paramWithConfigs.Param.Name,
-		"config len", len(paramWithConfigs.Config),
-		"param len", len(paramWithConfigs.Param.Content),
-		"type", paramWithConfigs.Param.ElementType,
-	)
-	return nil
-}
-
-// FinishInitParams tells the parameter server that the parameter
-// initialization has finished.
-func (s *Service) FinishInitParams(_ int, _ *int) error {
-	select {
-	case <-s.initialized:
-		log.Warn("finished init param called but parameters already initialized.")
-		return errors.New(AlreadyInitialized)
-	default:
-	}
-
-	close(s.initialized)
-	go func() {
-		t := time.Tick(s.checkpointInterval)
-		for range t {
-			err := s.checkpoint()
-			if err != nil {
-				log.Error("checkpoint error", log.Ctx{"error": err})
-			}
-		}
-	}()
-
-	log.Info("init parameter finished.")
-	return nil
-}
-
-// SendGrad sends gradient to parameter servers for parameter
-// optimization.
-func (s *Service) SendGrad(g Gradient, _ *int) error {
-	select {
-	case <-s.initialized:
-	default:
-		log.Warn("received gradient before initialization.",
-			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-		return errors.New(Uninitialized)
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	o, ok := s.optMap[g.Name]
-	if !ok {
-		log.Warn("received gradient but can't find name.",
-			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-		return fmt.Errorf("parameter: %s does not exist", g.Name)
-	}
-
-	log.Debug(Parameter(g).String())
-	log.Info("received gradient from trainer, updating gradient.",
-		"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-	return o.UpdateParameter(g)
-}
-
-// GetParam gets parameters from the parameter server.
-func (s *Service) GetParam(name string, parameter *Parameter) error {
-	<-s.initialized
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	opt, ok := s.optMap[name]
-	if !ok {
-		log.Warn("trainer wants to get a parameter that does not exist.", "name", name)
-		return fmt.Errorf("parameter: %s does not exist", name)
-	}
-
-	// The parameter content (a byte slice) may change
-	// during RPC serialization due to write from other
-	// goroutine, we allow it since mini-batch based deep
-	// learning optimization methods are stochastic in
-	// nature. This race condition is allowed deliberately
-	// to save the program from making a copy of the
-	// parameter content.
-	parameter.Name = name
-	parameter.ElementType = opt.elementType
-	parameter.Content = opt.GetWeights()
-	log.Debug(parameter.String())
-	log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
-	return nil
-}
-
-func traceTime(start time.Time, name string) {
-	elapsed := time.Since(start)
-	log.Info("time elapsed", log.Ctx{"name": name, "elapsed": elapsed})
-}
-
-// checkpoint saves checkpoint to disk.
-//
-// checkpoint should be only called after the parameters are
-// initialized.
-func (s *Service) checkpoint() (err error) {
-	log.Info("Begin save checkpoint.")
-	defer traceTime(time.Now(), "save checkpoint")
-
-	s.mu.Lock()
-	cp := make([]parameterCheckpoint, len(s.optMap))
-	index := 0
-	// TODO(helin): write checkpoint incrementally to reduce memory
-	// footprint during checkpoint.
-	for name, opt := range s.optMap {
-		var pc parameterCheckpoint
-		pc.Param.Name = name
-		pc.Param.ElementType = opt.elementType
-		pc.Param.Content = opt.GetWeights()
-		pc.Config = opt.config
-		pc.State = opt.GetStates()
-		cp[index] = pc
-		index++
-	}
-	s.mu.Unlock()
-
-	var buf bytes.Buffer
-	encoder := gob.NewEncoder(&buf)
-	err = encoder.Encode(cp)
-	if err != nil {
-		return
-	}
-
-	if _, err = os.Stat(s.checkpointPath); os.IsNotExist(err) {
-		err = os.MkdirAll(s.checkpointPath, os.ModePerm)
-		if err != nil {
-			return
-		}
-	}
-
-	id := uuid.NewV4().String()
-	p := path.Join(s.checkpointPath, id)
-	f, err := os.Create(p)
-	if err != nil {
-		return
-	}
-
-	defer func() {
-		closeErr := f.Close()
-		if closeErr != nil {
-			if err != nil {
-				log.Error("error close checkpoint file", log.Ctx{"error": closeErr})
-			} else {
-				// Set closeErr as return value.
-				err = closeErr
-			}
-		}
-	}()
-
-	writer := bufio.NewWriter(f)
-	_, err = writer.Write(buf.Bytes())
-	if err != nil {
-		return
-	}
-
-	err = writer.Flush()
-	if err != nil {
-		return
-	}
-
-	oldMeta, err := loadMeta(s.client, s.idx)
-	if err == ErrCheckpointNotFound {
-		log.Info("old meta not found, skip removing old meta")
-		err = nil
-	} else if err == nil {
-		log.Info("removing old meta")
-		if oldMeta.Path != "" {
-			rmErr := os.Remove(oldMeta.Path)
-			if rmErr != nil {
-				// log error, but still treat checkpoint as
-				// successful.
-				log.Error("remove old meta file error", log.Ctx{"error": rmErr})
-			}
-		}
-	}
-
-	if err != nil {
-		return
-	}
-
-	crc32 := crc32.ChecksumIEEE(buf.Bytes())
-	cpMeta := checkpointMeta{
-		UUID:      id,
-		Timestamp: time.Now().UnixNano(),
-		CRC32:     crc32,
-		Path:      p,
-	}
-
-	json, err := json.Marshal(cpMeta)
-	if err != nil {
-		return
-	}
-
-	err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false)
-	if err != nil {
-		return
-	}
-
-	return
-}
diff --git a/go/pserver/service_internal_test.go b/go/pserver/service_internal_test.go
deleted file mode 100644
index 36eca5112b3..00000000000
--- a/go/pserver/service_internal_test.go
+++ /dev/null
@@ -1,86 +0,0 @@
-package pserver
-
-import (
-	"bytes"
-	"encoding/binary"
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/assert"
-)
-
-const testDir = "./test_data"
-
-type myKV struct {
-	m map[string][]byte
-}
-
-func (m *myKV) GetKey(key string, timeout time.Duration) ([]byte, error) {
-	if m.m == nil {
-		m.m = make(map[string][]byte)
-	}
-	return m.m[key], nil
-}
-
-func (m *myKV) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
-	if m.m == nil {
-		m.m = make(map[string][]byte)
-	}
-	m.m[key] = value
-	return nil
-}
-
-func TestCheckpoint(t *testing.T) {
-	kv := &myKV{}
-	s, err := NewService(0, time.Hour, testDir, kv, nil)
-	assert.Nil(t, err)
-	err = s.checkpoint()
-	assert.Nil(t, err)
-	_, err = LoadCheckpoint(kv, 0)
-	assert.Nil(t, err)
-}
-
-func float32ToByte(f float32) []byte {
-	var buf bytes.Buffer
-	err := binary.Write(&buf, binary.LittleEndian, f)
-	if err != nil {
-		fmt.Println("binary.Write failed:", err)
-	}
-	return buf.Bytes()
-}
-
-func TestCheckpointWithData(t *testing.T) {
-	kv := &myKV{}
-	s, err := NewService(0, time.Hour, testDir, kv, nil)
-	assert.Nil(t, err)
-
-	var content []byte
-	for i := 0; i < 50000; i++ {
-		content = append(content, float32ToByte(float32(i))...)
-	}
-
-	p1 := Parameter{Name: "p1", ElementType: 1, Content: content}
-	err = s.InitParam(ParameterWithConfig{Param: p1}, nil)
-	assert.Nil(t, err)
-
-	err = s.FinishInitParams(0, nil)
-	assert.Nil(t, err)
-
-	var p2 Parameter
-	err = s.GetParam(p1.Name, &p2)
-	assert.Nil(t, err)
-	assert.Equal(t, p1, p2)
-
-	err = s.checkpoint()
-	assert.Nil(t, err)
-	cp, err := LoadCheckpoint(kv, 0)
-	assert.Nil(t, err)
-	s1, err := NewService(0, time.Hour, testDir, kv, cp)
-	assert.Nil(t, err)
-
-	var p3 Parameter
-	err = s1.GetParam(p1.Name, &p3)
-	assert.Nil(t, err)
-	assert.Equal(t, p1, p3)
-}
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
deleted file mode 100644
index 6949348e933..00000000000
--- a/go/pserver/service_test.go
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver_test
-
-import (
-	"fmt"
-	"io/ioutil"
-	"reflect"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-)
-
-const (
-	OptimizerConfig = "./client/c/test/testdata/optimizer.pb"
-)
-
-func TestServiceFull(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Error(err)
-	}
-	var p pserver.Parameter
-	p.Name = "param_a"
-	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
-	p.ElementType = pserver.Int32
-	config, err := ioutil.ReadFile(OptimizerConfig)
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var p1 pserver.Parameter
-	p1.Name = "param_b"
-	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
-	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var param pserver.Parameter
-	err = s.GetParam("param_b", &param)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if !reflect.DeepEqual(param, p1) {
-		t.Fatal("not equal:", param, p1)
-	}
-
-	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-
-	err = s.SendGrad(g1, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-	err = s.SendGrad(g2, nil)
-
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var param1 pserver.Parameter
-	err = s.GetParam("param_a", &param1)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// don't compare content, since it's already changed by
-	// gradient update.
-	param1.Content = nil
-	p.Content = nil
-
-	if !reflect.DeepEqual(param1, p) {
-		t.Fatal("not equal:", param1, p)
-	}
-}
-
-func TestMultipleInit(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Fatal(err)
-	}
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err.Error() != pserver.AlreadyInitialized {
-		t.Fatal(err)
-	}
-}
-
-func TestUninitialized(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	err = s.SendGrad(pserver.Gradient{}, nil)
-	if err.Error() != pserver.Uninitialized {
-		t.Fatal(err)
-	}
-}
-
-func TestBlockUntilInitialized(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Error(err)
-	}
-	ch := make(chan struct{}, 2)
-	errCh := make(chan error, 2)
-	var wg sync.WaitGroup
-	wg.Add(1)
-	go func() {
-		var param pserver.Parameter
-		err := s.GetParam("param_a", &param)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
-	time.Sleep(50 * time.Millisecond)
-
-	select {
-	case <-ch:
-		// some function returned before initialization is completed.
-		t.FailNow()
-	case <-errCh:
-		t.FailNow()
-	default:
-	}
-
-	var p pserver.Parameter
-	p.Name = "param_a"
-	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
-	p.ElementType = pserver.Int32
-	config, err := ioutil.ReadFile(OptimizerConfig)
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
-
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	wg.Wait()
-}
-
-func TestGradientString(t *testing.T) {
-	g := pserver.Parameter{}
-	g.ElementType = pserver.Float32
-	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
-	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
-		t.Fatal("get float data error!")
-	}
-
-	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
-	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699...3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
-		t.Fatal("get float data error!", g.String())
-	}
-	fmt.Println(g)
-}
diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt
deleted file mode 100644
index 3100f2b5a52..00000000000
--- a/go/utils/networkhelper/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(network_helper_test)
-endif()
diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go
deleted file mode 100644
index d205b6c5020..00000000000
--- a/go/utils/networkhelper/helper.go
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package networkhelper
-
-import (
-	"errors"
-	"net"
-)
-
-// GetExternalIP returns the ip address of local network interface, not the
-// loopback device.
-func GetExternalIP() (string, error) {
-	ifaces, err := net.Interfaces()
-	if err != nil {
-		return "", err
-	}
-	for _, iface := range ifaces {
-		if iface.Flags&net.FlagUp == 0 {
-			continue // interface down
-		}
-		if iface.Flags&net.FlagLoopback != 0 {
-			continue // loopback interface
-		}
-		addrs, err := iface.Addrs()
-		if err != nil {
-			return "", err
-		}
-		for _, addr := range addrs {
-			var ip net.IP
-			switch v := addr.(type) {
-			case *net.IPNet:
-				ip = v.IP
-			case *net.IPAddr:
-				ip = v.IP
-			}
-			if ip == nil || ip.IsLoopback() {
-				continue
-			}
-			ip = ip.To4()
-			if ip == nil {
-				continue // not an ipv4 address
-			}
-			return ip.String(), nil
-		}
-	}
-	return "", errors.New("are you connected to the network?")
-}
diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go
deleted file mode 100644
index 60b520fae15..00000000000
--- a/go/utils/networkhelper/helper_test.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package networkhelper
-
-import "testing"
-
-func TestGetIP(t *testing.T) {
-	_, err := GetExternalIP()
-	if err != nil {
-		t.Errorf("GetExternalIP returns error : %v\n", err)
-	}
-}
diff --git a/proto/.gitignore b/proto/.gitignore
deleted file mode 100644
index a0f00082c8e..00000000000
--- a/proto/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-CMakeLists.txt
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
deleted file mode 100644
index a075eeb83bd..00000000000
--- a/proto/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-if (MOBILE_INFERENCE)
-    file(GLOB proto_filenames . ModelConfig.proto ParameterConfig.proto
-         TrainerConfig.proto DataConfig.proto)
-else()
-    file(GLOB proto_filenames . *.proto)
-endif()
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-proto_library(paddle_proto SRCS ${proto_filenames})
-
-set(PROTO_GEN)
-set(PROTO_GEN_PY)
-
-foreach(filename ${proto_filenames})
-    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
-    get_filename_component(FIL_WE ${filename} NAME_WE)
-    set(CUR_PROTO_GEN_PY
-            ${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
-    set(PROTO_GEN_PY
-            ${CUR_PROTO_GEN_PY}
-            ${PROTO_GEN_PY})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-            COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
-            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
-            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} protoc)
-endforeach()
-
-add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-
-
-if (WITH_GOLANG)
-    add_custom_target(protoc-gen-go)
-    add_custom_command(TARGET protoc-gen-go
-            COMMAND go 
-            ARGS "get" "-u" "github.com/golang/protobuf/protoc-gen-go")
-
-    set(PROTO_GEN_GO)
-    file(GLOB proto_filenames . OptimizerConfig.proto)
-    foreach(filename ${proto_filenames})
-        message(STATUS ${filename})
-        get_filename_component(ABS_FIL ${filename} ABSOLUTE)
-        get_filename_component(FIL_WE ${filename} NAME_WE)
-        set(CUR_PROTO_GEN_GO
-                ${PADDLE_SOURCE_DIR}/paddle/go/proto/${FIL_WE}.pb.go)
-        set(PROTO_GEN_GO
-                ${CUR_PROTO_GEN_GO}
-                ${PROTO_GEN_GO})
-        add_custom_command(OUTPUT ${CUR_PROTO_GEN_GO}
-                COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-                ARGS "--go_out=${PADDLE_SOURCE_DIR}/go/proto"
-                "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-                DEPENDS ${ABS_FIL} protoc protoc-gen-go)
-    endforeach()
-    add_custom_target(gen_proto_go ALL DEPENDS ${PROTO_GEN_GO})
-endif()
diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto
deleted file mode 100644
index 1b2aa8e726d..00000000000
--- a/proto/DataConfig.proto
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-message FileGroupConf {
-  optional uint32 queue_capacity = 1 [ default = 1 ];
-  // how many files to load for a load file thread
-  optional int32 load_file_count = 2 [ default = 1 ];
-  // how many threads to load files
-  // Setting to be 5~10 is appropriate when loading files by hadoop vfs
-  optional int32 load_thread_num = 3 [ default = 1 ];
-};
-
-message DataConfig {
-
-  required string type = 1;
-
-  // name of a text file which contains a list of file names at each line
-  optional string files = 3;
-
-  optional int32 feat_dim = 4;         // feature dimension of one frame
-  repeated int32 slot_dims = 5;        // feature slot dims
-  optional int32 context_len = 6;      // max neibour frame numbers
-  optional uint64 buffer_capacity = 7; // the number of samples
-
-  // part of data used in training
-  // if not -1, part of train data is used in training
-  optional int64 train_sample_num = 8 [ default = -1 ];
-
-  // The number of documents processed once
-  optional int32 file_load_num = 9 [ default = -1 ];
-  optional bool async_load_data = 12 [ default = false ];
-  /// Note the field number 10, 11 and 13 have been deprecated.
-  optional bool for_test = 14
-      [ default = false ]; // whether this data is for test
-  optional FileGroupConf file_group_conf = 15;
-  repeated int32 float_slot_dims = 16;
-
-  /// Note the field number 17, 18 and 19 have been deprecated.
-
-  // a list of values which will be used to create additional one dimensional
-  // float
-  // values slots. These one dimensional slots can be used as the weight input
-  // for cost layers.
-  // Currently this is only supported by ProtoDataProvider.
-  repeated double constant_slots = 20;
-
-  // for PyDataProvider.
-  // Specify the load data script module name, object name and user args
-  optional string load_data_module = 21;
-  optional string load_data_object = 22;
-  optional string load_data_args = 23;
-
-  // for MultiDataProvider
-  repeated DataConfig sub_data_configs = 24; // sub dataproviders
-                                             /*
-                                              * the ratio of each sub dataproviders:
-                                              * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
-                                              * then each mini-batch is combined by 10 instance from A and 90 instances
-                                              * from B.
-                                              */
-  optional int32 data_ratio = 25;
-  /*
-   * if one of the sub dataproviders is running out of data, then
-   * (1) it is "main data", then finish current pass.
-   * (2) it is not "main data", then reset it, and try getNextBatch again.
-   */
-  optional bool is_main_data = 26 [ default = true ];
-
-  // the usage ratio of instances. Setting to 1.0 means the use of all
-  // instances.
-  optional double usage_ratio = 27 [ default = 1.0 ];
-};
diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto
deleted file mode 100644
index 46b1f58bdb8..00000000000
--- a/proto/DataFormat.proto
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/*
- If values is not empty and ids is empty, this is a dense vector.
- If values is not empty and ids is not empty, this is a sparse vector. The
- position of each value
- is specified by ids.
- If values is empty and ids is not empty, this is a sparse vector whose non-zero
- values are 1.
- The position of each 1 is specified by ids.
-*/
-message VectorSlot {
-  repeated float values = 1 [ packed = true ];
-  repeated uint32 ids = 2 [ packed = true ];
-  /* For multidimensional data, for example "image width height depth" */
-  repeated uint32 dims = 3 [ packed = true ];
-  repeated string strs = 4;
-};
-
-/*
- SubseqSlot use to record whether VectorSlot or any other slot in future has
- subseq.
- If not all VectorSlot have subseq, we only store the one who has subseq, and
- use *slot_id* to record it.
- One vector_slots has one sequence, and it may have N subseq, thus the number of
- *lens* will be N too.
-*/
-message SubseqSlot {
-  required uint32 slot_id = 1; // the id of slot who has subseq
-  repeated uint32 lens = 2;    // lengths of sub-sequence in the slot
-};
-
-message SlotDef {
-  enum SlotType {
-    VECTOR_DENSE = 0;
-    VECTOR_SPARSE_NON_VALUE = 1;
-    VECTOR_SPARSE_VALUE = 2;
-    INDEX = 3; // This can be used as label, or word id, etc.
-    VAR_MDIM_DENSE = 4;
-    VAR_MDIM_INDEX = 5;
-    STRING = 6;
-  }
-  required SlotType type = 1;
-  required uint32 dim =
-      2; // For INDEX slots, this means the maximal index plus 1.
-};
-
-message DataHeader {
-  // INDEX slot should be always after VECTOR slots.
-  repeated SlotDef slot_defs = 1;
-};
-
-message DataSample {
-  optional bool is_beginning = 1
-      [ default = true ]; // is the beginning of a sequence
-  repeated VectorSlot vector_slots = 2;
-  repeated uint32 id_slots = 3 [ packed = true ];
-  /* use ids of VectorSlot */
-  repeated VectorSlot var_id_slots = 4;
-  repeated SubseqSlot subseq_slots = 5;
-};
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
deleted file mode 100644
index d78ee9c9f39..00000000000
--- a/proto/ModelConfig.proto
+++ /dev/null
@@ -1,698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "ParameterConfig.proto";
-
-package paddle;
-
-/**
- * Various structs for the configuration of a neural network
- */
-
-message ExternalConfig {
-  repeated string layer_names = 1;
-  repeated string input_layer_names = 2;
-  repeated string output_layer_names = 3;
-}
-
-message ActivationConfig {
-  // identity: f(x) = x
-  // sigmoid: f(x) = 1 / (1 + exp(-x))
-  // logistic: f(x) = (1 - exp(-x)) / (1+ exp(-x))
-  // softmax: y_i = f(x_i) = exp(x_i) / (\sum_i exp(x_i))
-  // relu: y = max(0, x)
-  required string type = 1;
-};
-
-message ConvConfig {
-  // filter_size = 5, says that this layer will use
-  // filters of size 5x5 pixels.
-  required uint32 filter_size = 1;
-
-  // The image data dimensionality.
-  // This value must be either 1, 2, 3, or a multiple of 4.
-  required uint32 channels = 2;
-
-  // stride = 1, indicates that the distance between
-  // successive filter applications should be 1 pixel.
-  required uint32 stride = 3;
-
-  // padding = 4, instructs the net to implicitly
-  // pad the images with a 4-pixel border of zeros.
-  required uint32 padding = 4;
-
-  // If groups = 4 together with the filters = 32 parameter,
-  // they state that this convolutional layer is to have 4
-  // groups of 32 filters. Each filter will connect to 8
-  // input channels.
-  required uint32 groups = 5;
-  required uint32 filter_channels = 6;
-
-  // The size of output feature map.
-  required uint32 output_x = 7;
-
-  // The size of input feature map.
-  required uint32 img_size = 8;
-
-  // caffe mode for output size coherence
-  required bool caffe_mode = 9 [ default = true ];
-
-  // if filter_size_y is set , this convolutional layer will use
-  // filters of size filter_size * filter_size_y pixels.
-  // if filter_size_y is not set, this convolutional layer will use
-  // filters of size filter_size * filter_size
-  required uint32 filter_size_y = 10;
-  required uint32 padding_y = 11;
-  required uint32 stride_y = 12;
-
-  // if not set, use output_x
-  optional uint32 output_y = 13;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 14;
-
-  optional uint32 dilation = 15 [ default = 1 ];
-  optional uint32 dilation_y = 16 [ default = 1 ];
-
-  optional uint32 filter_size_z = 17 [ default = 1 ];
-  optional uint32 padding_z = 18 [ default = 1 ];
-  optional uint32 stride_z = 19 [ default = 1 ];
-  optional uint32 output_z = 20 [ default = 1 ];
-  optional uint32 img_size_z = 21 [ default = 1 ];
-}
-
-message PoolConfig {
-  // max or avg pooling
-  required string pool_type = 1;
-  required uint32 channels = 2;
-
-  // Defines the size of the pooling region in
-  // the x (equivalently, y) dimension.
-  required uint32 size_x = 3;
-
-  // Tell the net where in the input image to start the pooling.
-  // start is deprecated now.
-  optional uint32 start = 4;
-
-  // Defines the stride size between successive pooling squares.
-  required uint32 stride = 5 [ default = 1 ];
-
-  // The size of output feature map.
-  required uint32 output_x = 6;
-
-  // The size of input feature map.
-  required uint32 img_size = 7;
-
-  // padding = 4, instructs the net to implicitly
-  // pad the images with a 4-pixel border of zeros.
-  optional uint32 padding = 8 [ default = 0 ];
-
-  // if not set, use size_x
-  optional uint32 size_y = 9;
-
-  // if not set, use stride
-  optional uint32 stride_y = 10;
-
-  // if not set, use output_x
-  optional uint32 output_y = 11;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 12;
-
-  // if not set, use padding
-  optional uint32 padding_y = 13;
-
-  optional uint32 size_z = 14 [ default = 1 ];
-  optional uint32 stride_z = 15 [ default = 1 ];
-  optional uint32 output_z = 16 [ default = 1 ];
-  optional uint32 img_size_z = 17 [ default = 1 ];
-  optional uint32 padding_z = 18 [ default = 1 ];
-
-  optional bool exclude_mode = 19;
-}
-
-message SppConfig {
-  required ImageConfig image_conf = 1;
-  required string pool_type = 2;
-  required uint32 pyramid_height = 3;
-}
-
-message NormConfig {
-  // rnorm or cmrnorm
-  required string norm_type = 1;
-  required uint32 channels = 2;
-
-  // rnorm: this defines the size of the local regions
-  // used for response normalization.
-  // cmrnorm: The size parameter indicates how many
-  // nearby maps to use for normalization.
-  required uint32 size = 3;
-
-  // the parameters for normalization
-  // u = u / (1+scale*sum(u^2 in window))^pow
-  required double scale = 4;
-  required double pow = 5;
-
-  // The size of output feature map.
-  required uint32 output_x = 6;
-
-  // The size of input feature map.
-  required uint32 img_size = 7;
-
-  // normalize with fixed window or sliding window
-  // u = u / (1+scale*sum(u^2 in window))^pow
-  // fixed window: shared a fixed window for each value
-  // sliding window: have a different window for each value
-  optional bool blocked = 8;
-
-  // if not set, use output_x
-  optional uint32 output_y = 9;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 10;
-}
-
-message BlockExpandConfig {
-  required uint32 channels = 1;
-
-  required uint32 stride_x = 2;
-  required uint32 stride_y = 3;
-
-  required uint32 padding_x = 4;
-  required uint32 padding_y = 5;
-
-  required uint32 block_x = 6;
-  required uint32 block_y = 7;
-
-  // The size of output feature map.
-  required uint32 output_x = 8;
-  required uint32 output_y = 9;
-
-  // The size of input feature map.
-  required uint32 img_size_x = 10;
-  required uint32 img_size_y = 11;
-}
-
-message MaxOutConfig {
-  required ImageConfig image_conf = 1;
-  required uint32 groups = 2;
-}
-
-message RowConvConfig { required uint32 context_length = 1; }
-
-message SliceConfig {
-  required uint32 start = 1;
-  required uint32 end = 2;
-}
-
-message ProjectionConfig {
-  required string type = 1;
-  required string name = 2;
-  required uint64 input_size = 3;
-  required uint64 output_size = 4;
-
-  // For ShiftProjection
-  optional int32 context_start = 5;
-  optional int32 context_length = 6;
-  optional bool trainable_padding = 7 [ default = false ];
-
-  // For convolution
-  optional ConvConfig conv_conf = 8;
-  optional int32 num_filters = 9;
-
-  // For IdentityOffsetProjection
-  optional uint64 offset = 11 [ default = 0 ];
-
-  // For pool
-  optional PoolConfig pool_conf = 12;
-
-  // For slice
-  // Each slice output is the input[start, end)
-  repeated SliceConfig slices = 13;
-}
-
-message OperatorConfig {
-  required string type = 1;
-  repeated int32 input_indices = 2;
-  repeated uint64 input_sizes = 3;
-  required uint64 output_size = 4;
-
-  // For DotMulOperator
-  optional double dotmul_scale = 5 [ default = 1.0 ];
-
-  // For ConvOperator
-  optional ConvConfig conv_conf = 6;
-  optional int32 num_filters = 7;
-}
-
-message BilinearInterpConfig {
-  // The size of input feature map.
-  required ImageConfig image_conf = 1;
-  // The size of output feature map.
-  required uint32 out_size_x = 2;
-  required uint32 out_size_y = 3;
-}
-
-message ImageConfig {
-  // The image data dimensionality.
-  // This value must be either 1, 2, 3, or a multiple of 4.
-  required uint32 channels = 2;
-
-  // The size of input feature map.
-  required uint32 img_size = 8;
-  optional uint32 img_size_y = 9;
-  optional uint32 img_size_z = 10 [ default = 1 ];
-}
-
-message PriorBoxConfig {
-  repeated uint32 min_size = 1;
-  repeated uint32 max_size = 2;
-  repeated float aspect_ratio = 3;
-  repeated float variance = 4;
-}
-
-message PadConfig {
-  required ImageConfig image_conf = 1;
-  repeated uint32 pad_c = 2;
-  repeated uint32 pad_h = 3;
-  repeated uint32 pad_w = 4;
-}
-
-message ReshapeConfig {
-  repeated uint32 height_axis = 1;
-  repeated uint32 width_axis = 2;
-}
-
-message MultiBoxLossConfig {
-  required uint32 num_classes = 1;
-  required float overlap_threshold = 2;
-  required float neg_pos_ratio = 3;
-  required float neg_overlap = 4;
-  required uint32 background_id = 5;
-  required uint32 input_num = 6;
-  optional uint32 height = 7 [ default = 1 ];
-  optional uint32 width = 8 [ default = 1 ];
-}
-
-message DetectionOutputConfig {
-  required uint32 num_classes = 1;
-  required float nms_threshold = 2;
-  required uint32 nms_top_k = 3;
-  required uint32 background_id = 4;
-  required uint32 input_num = 5;
-  required uint32 keep_top_k = 6;
-  required float confidence_threshold = 7;
-  optional uint32 height = 8 [ default = 1 ];
-  optional uint32 width = 9 [ default = 1 ];
-}
-
-message ClipConfig {
-  required double min = 1;
-  required double max = 2;
-}
-
-message UpsampleConfig {
-  required ImageConfig image_conf = 1;
-  optional uint32 scale = 2 [ default = 2 ];
-  optional uint32 scale_y = 3 [ default = 2 ];
-  optional bool pad_out_x = 4 [ default = false ];
-  optional bool pad_out_y = 5 [ default = false ];
-  optional uint32 upsample_size = 6;
-  optional uint32 upsample_size_y = 7;
-}
-
-message ROIPoolConfig {
-  required uint32 pooled_width = 1;
-  required uint32 pooled_height = 2;
-  required float spatial_scale = 3;
-  optional uint32 height = 4 [ default = 1 ];
-  optional uint32 width = 5 [ default = 1 ];
-}
-
-message ScaleSubRegionConfig {
-  required ImageConfig image_conf = 1;
-  required float value = 2;
-}
-
-message LayerInputConfig {
-  required string input_layer_name = 1;
-  optional string input_parameter_name = 2;
-  optional ConvConfig conv_conf = 3;
-  optional PoolConfig pool_conf = 4;
-  optional NormConfig norm_conf = 5;
-  optional ProjectionConfig proj_conf = 6;
-  optional BlockExpandConfig block_expand_conf = 7;
-  optional ImageConfig image_conf = 8;
-  // If the input layer has multi-output.
-  // Set the argument name.
-  optional string input_layer_argument = 9;
-  optional BilinearInterpConfig bilinear_interp_conf = 10;
-  optional MaxOutConfig maxout_conf = 11;
-  optional SppConfig spp_conf = 12;
-  optional PriorBoxConfig priorbox_conf = 13;
-  optional PadConfig pad_conf = 14;
-  optional RowConvConfig row_conv_conf = 15;
-  optional MultiBoxLossConfig multibox_loss_conf = 16;
-  optional DetectionOutputConfig detection_output_conf = 17;
-  optional ClipConfig clip_conf = 18;
-  optional ScaleSubRegionConfig scale_sub_region_conf = 19;
-  optional ROIPoolConfig roi_pool_conf = 20;
-  optional UpsampleConfig upsample_conf = 21;
-}
-
-message LayerConfig {
-  required string name = 1;
-  required string type = 2;
-  optional uint64 size = 3;
-  // optional ActivationConfig activation = 4;
-  optional string active_type = 4;
-  repeated LayerInputConfig inputs = 5;
-  optional string bias_parameter_name = 6;
-
-  // This number must be a multiple of 16.
-  optional uint32 num_filters = 7;
-
-  // indicates that the biases of every filter in this layer
-  // should be shared amongst all applications of that filter
-  // (which is how convnets are usually trained). Setting this to
-  // false will untie the biases, yielding a separate bias for
-  // every location at which the filter is applied.
-  optional bool shared_biases = 8 [ default = false ];
-
-  // Valid values are ones that divide the area of the output
-  // grid in this convolutional layer. For example if this layer
-  // produces 32-channel 20x20 output grid, valid values of
-  // partialSum are ones which divide 20*20 = 400.
-  // I'll update this comments when confirmed
-  optional uint32 partial_sum = 9;
-
-  // for dropout
-  optional double drop_rate = 10;
-
-  // for HierarchicalSoftmaxLayer and NCELayer
-  // the number of classes
-  optional uint32 num_classes = 11;
-
-  // the gpu device which the Layer's data in.
-  // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 12 [ default = -1 ];
-
-  // for recurrent layer. If true, the recurrence runs from the end to the
-  // beginning.
-  optional bool reversed = 13 [ default = false ];
-
-  // for lstmemory layer. Different types of nodes have different activation
-  // type.
-  optional string active_gate_type = 14;
-  optional string active_state_type = 15;
-
-  // For NCELayer
-  // The number of random negative labels for each sample
-  optional int32 num_neg_samples = 16 [ default = 10 ];
-
-  // For NCELayer
-  // The distribution for generating the random negative labels.
-  // A uniform distribution will be used if not provided
-  repeated double neg_sampling_dist = 17 [ packed = true ];
-
-  // For MaxLayer
-  // default: output VALUE of MaxLayer. set this flag to true for output INDEX
-  // INDEX will be put in Argument::value as double values.
-  optional bool output_max_index = 19 [ default = false ];
-
-  /// The filed number 20 have been deprecated.
-
-  // For self-normalized estimation
-  optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ];
-
-  /// The filed numbers 22 and 23 have been deprecated.
-
-  // for MDLstmLayer
-  repeated bool directions = 24;
-
-  // for CTCLayer
-  optional bool norm_by_times = 25;
-
-  // for CostLayers
-  optional double coeff = 26 [ default = 1.0 ];
-
-  // for AverageLayer
-  // can be set to: 'average', 'sum' or 'squarerootn'
-  optional string average_strategy = 27;
-
-  // for error clipping
-  optional double error_clipping_threshold = 28 [ default = 0.0 ];
-
-  // for operators used by mixed layer
-  repeated OperatorConfig operator_confs = 29;
-
-  // for lambdaCost
-  optional int32 NDCG_num = 30;
-  optional int32 max_sort_size = 31;
-
-  // for SlopeInterceptLayer
-  optional double slope = 32;
-  optional double intercept = 33;
-
-  // for CosSimVecMatLayer and CosSimLayer
-  optional double cos_scale = 34;
-
-  // for DataNormLayer
-  // can be set to: 'z-score', 'min-max' or 'decimal-scaling'
-  optional string data_norm_strategy = 36;
-
-  // for bos/eos id
-  optional uint32 bos_id = 37;
-  optional uint32 eos_id = 38;
-
-  // for max id layer
-  optional uint32 beam_size = 39;
-
-  // for seqlastins layer, whether select first instead last
-  optional bool select_first = 40 [ default = false ];
-
-  // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
-  // can be set to: 'non-seq','seq'
-  optional string trans_type = 41 [ default = 'non-seq' ];
-
-  // to indicate whether selective_fc layer
-  // is used in sequence generation or not
-  optional bool selective_fc_pass_generation = 42 [ default = false ];
-
-  // to indicate whether selective_fc layer take its last input to
-  // selected several columns and only compute the multiplications
-  // between the input matrices and the selected columns of
-  // the parameter matrices of this layer.
-  // if set false, selective_fc degrades into fc.
-  optional bool has_selected_colums = 43 [ default = true ];
-
-  // this parameter is for speed consideration.
-  // if number of the selected columns is less than
-  // sample number * selective_fc output size * selective_fc_mull_mull_ratio
-  // sparse multiplication is used, otherwise, using full multiplication.
-  optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ];
-
-  // to indicate how many threads selective_fc use to to accelate
-  // the plain_mul period
-  // leave empty or set to 0 to disable multi-thread accleleration
-  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45
-      [ default = 0 ];
-
-  // for batch normalization layer
-  // if set use_global_stats true, will use the loaded mean and variance.
-  optional bool use_global_stats = 46;
-
-  // use to compute moving mean and variance.
-  optional double moving_average_fraction = 47 [ default = 0.9 ];
-
-  // bias size
-  optional uint32 bias_size = 48 [ default = 0 ];
-
-  // this parameter can be used as a user-defined parameter when necessary,
-  // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented,
-  // it can be used to pass that parameter, without modifying the proto file.
-  // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.
-  optional string user_arg = 49;
-
-  // to indicate rectangle image data
-  optional uint64 height = 50;
-  optional uint64 width = 51;
-
-  // blank label used in ctc loss
-  optional uint32 blank = 52 [ default = 0 ];
-
-  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
-  // controls the scope of pooling operation. can be set > 0.
-  // leave empty or set to -1 to disable this stride pooling.
-  optional int32 seq_pool_stride = 53 [ default = -1 ];
-
-  // for crop layer
-  optional int32 axis = 54 [ default = 2 ];
-  repeated uint32 offset = 55;
-  repeated uint32 shape = 56;
-
-  // for HuberRegressionLoss
-  optional double delta = 57 [ default = 1.0 ];
-
-  // for 3D data
-  optional uint64 depth = 58 [ default = 1 ];
-
-  // for switch order layer
-  optional ReshapeConfig reshape_conf = 59;
-
-  // for batch normalization layer
-  // The small constant added to the variance to improve numeric stability.
-  optional double epsilon = 60 [ default = 0.00001 ];
-
-  // for factorization machine layer
-  optional uint32 factor_size = 61;
-}
-
-message EvaluatorConfig {
-  required string name = 1;
-  required string type = 2;
-  repeated string input_layers = 3;
-
-  // Used by ChunkEvaluator
-  // one of "IOB", "IOE", "IOBES"
-  optional string chunk_scheme = 4;
-  // number of chunk types other than "other"
-  optional int32 num_chunk_types = 5;
-
-  // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
-  // For multi binary labels: true if output > classification_threshold
-  optional double classification_threshold = 6 [ default = 0.5 ];
-  // The positive label. -1 means average precision and recall
-  optional int32 positive_label = 7 [ default = -1 ];
-
-  // load dict from this file
-  optional string dict_file = 8;
-
-  // dump result in this file
-  optional string result_file = 9;
-
-  // top # results for max id printer
-  optional int32 num_results = 10 [ default = 1 ];
-
-  // whether to delimit the sequence in the seq_text_printer
-  optional bool delimited = 11 [ default = true ];
-
-  // Used by ChunkEvaluator
-  // chunk of these types are not counted
-  repeated int32 excluded_chunk_types = 12;
-
-  // Used by ClassificationErrorEvaluator
-  // top # classification error
-  optional int32 top_k = 13 [ default = 1 ];
-
-  // Used by DetectionMAPEvaluator
-  optional double overlap_threshold = 14 [ default = 0.5 ];
-
-  optional int32 background_id = 15 [ default = 0 ];
-
-  optional bool evaluate_difficult = 16 [ default = false ];
-
-  optional string ap_type = 17 [ default = "11point" ];
-}
-
-message LinkConfig {
-  required string layer_name = 1;
-  required string link_name = 2;
-  // If true, this link has sub-sequence
-  optional bool has_subseq = 3 [ default = false ];
-}
-
-message MemoryConfig {
-  required string layer_name = 1;
-  required string link_name = 2;
-
-  optional string boot_layer_name = 3;
-  optional string boot_bias_parameter_name = 4;
-  optional string boot_bias_active_type = 5;
-  optional uint32 boot_with_const_id = 7;
-
-  // memory is a sequence, initailized by a sequence boot layer
-  optional bool is_sequence = 6 [ default = false ];
-}
-
-message GeneratorConfig {
-  required uint32 max_num_frames = 1;
-  required string eos_layer_name = 2;
-  optional int32 num_results_per_sample = 3 [ default = 1 ];
-
-  // for beam search
-  optional int32 beam_size = 4 [ default = 1 ];
-
-  optional bool log_prob = 5 [ default = true ];
-}
-
-message SubModelConfig {
-  required string name = 1;
-  repeated string layer_names = 2; // selected layers in sub model
-  repeated string input_layer_names = 3;
-  repeated string output_layer_names = 4;
-  repeated string evaluator_names = 5;
-
-  optional bool is_recurrent_layer_group = 6 [ default = false ];
-
-  // If true, the recurrence runs from the end to the beginning.
-  optional bool reversed = 7 [ default = false ];
-
-  // name and link name of memory
-  repeated MemoryConfig memories = 8;
-
-  // if use recurrent layer group, all layers in submodel will postfix by
-  // "_in_"+submodel.name, so we add a name pair to link between
-  // root model and layer group,
-  // note that these in/out layers are not input/output of the network.
-  repeated LinkConfig in_links = 9;
-  repeated LinkConfig out_links = 10;
-
-  optional GeneratorConfig generator = 11;
-
-  // the id of inlink which share info with outlinks, used in recurrent layer
-  // group
-  optional int32 target_inlinkid = 12;
-}
-
-message ModelConfig {
-  // type of the model.
-  // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
-  required string type = 1 [ default = "nn" ];
-
-  // layers should be ordered in such a way that the forward propagation
-  // can be correctly executed by going from the first layer to the last layer
-  repeated LayerConfig layers = 2;
-
-  repeated ParameterConfig parameters = 3;
-
-  // Input layers should have the same order as the data streams provided
-  // by the data provider. The type of input layers should be "data"
-  repeated string input_layer_names = 4;
-
-  // For training, the type of a output layer is usually cost layer.
-  // For prediction, they should be the actual output layers.
-  repeated string output_layer_names = 5;
-
-  repeated EvaluatorConfig evaluators = 6;
-
-  repeated SubModelConfig sub_models = 8;
-
-  // For External Machine, defining how to split a neural network
-  // into multiple parts.
-  optional ExternalConfig external_config = 9;
-};
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
deleted file mode 100644
index e9ea1bfbcc6..00000000000
--- a/proto/OptimizerConfig.proto
+++ /dev/null
@@ -1,164 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-syntax = "proto2";
-
-option optimize_for = LITE_RUNTIME;
-
-package paddle;
-
-message SGDConfig {
-  // SGD
-  // momentum: float >= 0. Parameter updates momentum.
-  // decay: float >= 0. Learning rate decay over each update.
-  // nesterov: boolean. Whether to apply Nesterov momentum.
-  optional double momentum = 21 [ default = 0.0 ];
-  optional double decay = 23 [ default = 0.0 ];
-  optional bool nesterov = 24 [ default = false ];
-}
-
-message AdadeltaConfig {
-  // Adadelta
-  // It is recommended to leave it at the default value.
-  // rho: float >= 0.
-  // epsilon: float >= 0. Fuzz factor.
-  // decay: float >= 0. Learning rate decay over each update.
-
-  // reference : [Adadelta - an adaptive learning rate
-  // method](http://arxiv.org/abs/1212.5701)
-  optional double rho = 33 [ default = 0.90 ];
-  optional double epsilon = 31 [ default = 1e-5 ];
-  optional double decay = 32 [ default = 0.0 ];
-}
-
-message AdagradConfig {
-  // Adagrad
-  // epsilon: float >= 0.
-  // decay: float >= 0. Learning rate decay over each update.
-
-  // reference : [Adaptive Subgradient Methods for Online Learning and
-  // Stochastic
-  // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-  optional double epsilon = 41 [ default = 1e-5 ];
-  optional double decay = 42 [ default = 0.0 ];
-}
-
-message AdamConfig {
-  // Adaj
-  // beta_1: float, 0 < beta < 1. Generally close to 1.
-  // beta_2: float, 0 < beta < 1. Generally close to 1.
-  // epsilon: float >= 0. Fuzz factor.
-  // decay: float >= 0. Learning rate decay over each update.
-  // reference : [Adam - A Method for Stochastic
-  // Optimization](http://arxiv.org/abs/1412.6980v8)
-  optional double beta_1 = 41;
-  optional double beta_2 = 42;
-  optional double epsilon = 43;
-  optional double decay = 44;
-}
-
-message ConstLrConfig {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-}
-
-message LinearLrConfig {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-  optional double lr_decay_a = 2;
-  optional double lr_decay_b = 3;
-}
-
-message TensorProto {
-  enum DataType {
-    PADDLE_ELEMENT_TYPE_INT32 = 0;
-    PADDLE_ELEMENT_TYPE_UINT32 = 1;
-    PADDLE_ELEMENT_TYPE_INT64 = 2;
-    PADDLE_ELEMENT_TYPE_UINT64 = 3;
-    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
-    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
-  }
-  optional DataType data_type = 1;
-  repeated bytes content = 2;
-}
-
-message LrPolicyState {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-  optional double lr_decay_a = 2;
-  optional double lr_decay_b = 3;
-}
-
-message SGDOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto momentums = 2;
-}
-
-message AdadeltaOptimizerState {
-  // learning rate policy
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto accum_gradient = 2;
-  optional TensorProto accum_delta = 3;
-  optional TensorProto update_delta = 4;
-}
-
-message AdagradOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto accum_gradient = 2;
-}
-
-message AdamOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto momentums = 2;
-  optional TensorProto velocitys = 3;
-}
-
-message OptimizerConfig {
-  enum Optimizer {
-    SGD = 1;
-    Adadelta = 2;
-    Adagrad = 3;
-    Adam = 4;
-  }
-  optional Optimizer optimizer = 1;
-  optional SGDConfig sgd = 3;
-  optional AdadeltaConfig adadelta = 4;
-  optional AdagradConfig adagrad = 5;
-  optional AdamConfig adam = 6;
-
-  enum LrPolicy {
-    Const = 0;
-    Linear = 1;
-  }
-  optional LrPolicy lr_policy = 11;
-  optional ConstLrConfig const_lr = 12;
-  optional LinearLrConfig linear_lr = 13;
-
-  // common config of optimizer
-  // gradient clip when L2 exceeding value
-  optional double clip_norm = 101;
-  // gradient clip when L1 exceeding value
-  optional double clip_value = 102;
-}
diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto
deleted file mode 100644
index 6f8ba9d7605..00000000000
--- a/proto/ParameterConfig.proto
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/**
- * Configuration structure for parameter
- */
-
-enum ParameterInitStrategy {
-  PARAMETER_INIT_NORMAL = 0;
-  PARAMETER_INIT_UNIFORM = 1;
-}
-
-message ParameterUpdaterHookConfig {
-  // hook type such as  'pruning'
-  required string type = 1;
-  // this represents the ratio of zero element to be set by the Parameter
-  optional double sparsity_ratio = 2 [ default = 0.6 ];
-}
-
-message ParameterConfig {
-  required string name = 1;
-  required uint64 size = 2;
-  optional double learning_rate = 3 [ default = 1.0 ];
-  optional double momentum = 4 [ default = 0.0 ];
-  optional double initial_mean = 5 [ default = 0.0 ];
-  optional double initial_std = 6 [ default = 0.01 ];
-  // use L2-regularization if decay_rate set and decay_rate_l1 not set
-  optional double decay_rate = 7 [ default = 0.0 ];
-  // use L1-regularization if decay_rate_l1 set
-  optional double decay_rate_l1 = 8 [ default = 0.0 ];
-  // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
-  repeated uint64 dims = 9;
-  // the gpu device which the parameter in.
-  // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 10 [ default = -1 ];
-  // how to init the parameter: 0 -> normal, 1 -> uniform
-  // 0: treat initial_mean as mean, intial_std as standard deviation
-  // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
-  optional int32 initial_strategy = 11 [ default = 0 ];
-  // define the variance when init the parameter, by height of the Matrix
-  optional bool initial_smart = 12 [ default = false ];
-  // apply regularization every # batches
-  optional int32 num_batches_regularization = 13 [ default = 1 ];
-  // if is_sparse is true, para is sparse, else para is dense
-  optional bool is_sparse = 14 [ default = false ];
-  // if para is sparse, format should be "csc" or "csr", empty means is not
-  // sparse
-  optional string format = 15 [ default = "" ];
-  // sparse remote update or not
-  optional bool sparse_remote_update = 16 [ default = false ];
-  // gradient clipping threshold, no clipping by default
-  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
-  // static parameters are fixed when training
-  optional bool is_static = 18 [ default = false ];
-  // para_id should NOT be set by config_parser. It is for
-  // internal use.
-  optional uint64 para_id = 19;
-
-  repeated ParameterUpdaterHookConfig update_hooks = 20;
-  // setup load mat -> csr
-  optional bool need_compact = 21 [ default = false ];
-  // whether to do sparse update for this parameter
-  optional bool sparse_update = 22 [ default = false ];
-
-  // whether this parameter is shared or not.
-  optional bool is_shared = 23 [ default = false ];
-  // parameter block size
-  optional uint64 parameter_block_size = 24 [ default = 0 ];
-}
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
deleted file mode 100644
index 1404c8aa143..00000000000
--- a/proto/ParameterServerConfig.proto
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/**
- * Configuration structure for ParameterClient2.
- */
-message ParameterClientConfig { required int32 trainer_id = 1; }
-
-/**
- * Configuration structure for ParameterServer2.
- */
-message ParameterServerConfig {
-  // Number of ports for sending dense parameter,
-  // following ports on parameter server will be visited
-  // for sending dense parameter: [port, port+ports_num-1]
-  required int32 ports_num = 1 [ default = 1 ];
-  // Number of ports for sending sparse parameter,
-  // following ports on parameter server will be visited
-  // for sending sparse parameter:
-  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
-  required int32 ports_num_for_sparse = 2 [ default = 0 ];
-  // network device name for pservers
-  required string nics = 3 [ default = "xgbe0,xgbe1" ];
-  required string rdma_tcp = 4 [ default = "tcp" ];
-  // Listening port for pserver
-  required int32 port = 5 [ default = 20134 ];
-  // number of gradient servers
-  required int32 num_gradient_servers = 6 [ default = 1 ];
-  // number of threads for sync op exec
-  required int32 pserver_num_threads = 7 [ default = 1 ];
-  // control config_.async_lagged_grad_discard_ratio() min value
-  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
-  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
-  // use it as defalut value
-  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
-}
diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto
deleted file mode 100644
index b56c1bfe7ca..00000000000
--- a/proto/ParameterService.proto
+++ /dev/null
@@ -1,351 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "ParameterConfig.proto";
-import "TrainerConfig.proto";
-
-package paddle;
-
-/**
- * Various structs for communicating with parameter server
- */
-enum ParameterUpdateMode {
-  // Set parameter
-  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
-  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
-
-  // Update parameter once a gradient is received
-  PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
-
-  // Accumulate gradient
-  PSERVER_UPDATE_MODE_ADD_GRADIENT = 3;
-
-  // Average parameters
-  PSERVER_UPDATE_MODE_AVERAGE_PARAMETER = 4;
-
-  // No update. Only get parameters back.
-  PSERVER_UPDATE_MODE_GET_PARAM = 5;
-  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
-};
-
-message ParameterBlock {
-  // it accurately means parameter id.
-  required uint64 para_id = 1;
-  // global sparse row or dense block for each block in parameter
-  required uint64 block_id = 2;
-  // offset in (local) storage
-  required uint64 begin_pos = 3;
-  // actual size of block, size for last block is [endDim -beginDim],
-  // others is parameter_block_size in ParameterConfig
-  required uint64 block_size = 4;
-}
-
-enum PServerStatus {
-  PSERVER_STATUS_NOT_SET = 0;
-  PSERVER_STATUS_PARAMETER_READY = 1;
-};
-
-enum BatchStatus {
-  BATCH_START = 0;
-  BATCH_ON = 1;
-  BATCH_FINISH = 2;
-  BATCH_START_AND_FINISH = 3;
-};
-
-message SendParameterRequest {
-  required ParameterUpdateMode update_mode = 1;
-  repeated ParameterBlock blocks = 2;
-  required bool send_back_parameter = 3;
-
-  // number of samples used for calculating this update
-  optional int64 num_samples = 4;
-
-  // cost will be used to calculate global objective value
-  optional double cost = 5;
-
-  required BatchStatus batch_status = 6;
-
-  optional int32 trainer_id = 7;
-
-  // send back parameter type on pserver, PARAMETER_VALUE by default
-  optional int32 send_back_parameter_type = 8 [ default = 0 ];
-
-  // forwardbackward time in usec
-  optional uint64 forwardbackward_time = 9;
-}
-
-message WaitPassStartRequest {}
-
-message WaitPassStartResponse {}
-
-message WaitPassFinishRequest {}
-
-message WaitPassFinishResponse {}
-
-enum SyncObject {
-  SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
-  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
-}
-
-message SynchronizeRequest {
-  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
-
-  optional int32 trainer_id = 2;
-}
-
-message SynchronizeResponse {}
-
-message SendParameterResponse { repeated ParameterBlock blocks = 1; }
-
-message SetConfigRequest {
-  repeated ParameterConfig param_configs = 1;
-  required OptimizationConfig opt_config = 2;
-  required string save_dir = 4;
-  required int32 server_id = 5;
-  required bool is_sparse_server = 6;
-}
-
-message SetConfigResponse {}
-
-message GetStatusRequest {}
-
-message GetStatusResponse { required PServerStatus status = 1; }
-
-message SetStatusRequest { required PServerStatus status = 1; }
-
-message SetStatusResponse {}
-
-// create a column vector. The size is the dimension of parameter
-message CreateVectorRequest {}
-
-message CreateVectorResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  required int64 handle = 2;
-}
-
-message ReleaseVectorRequest { required int64 handle = 1; }
-
-message ReleaseVectorResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-// Create a column major matrix. The number of rows is the dimension
-// of parameter. The number of columns is specifed by num_cols
-message CreateMatrixRequest { required int32 num_cols = 1; }
-
-message CreateMatrixResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  required int64 handle = 2;
-}
-
-message ReleaseMatrixRequest { required int64 handle = 1; }
-
-message ReleaseMatrixResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-/**
- * The operations are defined using the variables commented at Operation
- * and OperationResult
- */
-enum MatrixVectorOperation {
-  // r = u^T u
-  PSERVER_OP_utu = 0;
-
-  // r = u^T v
-  PSERVER_OP_utv = 1;
-
-  // u = a u
-  PSERVER_OP_au = 2;
-
-  // v = a u + b v
-  PSERVER_OP_au_bv = 3;
-
-  // u = a A x + b u
-  PSERVER_OP_aAx_bu = 4;
-
-  // Stochastic gradient update
-  PSERVER_OP_SGD = 5;
-
-  // u = a
-  PSERVER_OP_RESET = 6;
-
-  // v = u
-  PSERVER_OP_COPY = 7;
-
-  // w = a u + b v + c w
-  PSERVER_OP_au_bv_cw = 8;
-
-  // owlqn: MakeSteepestDescDir
-  PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
-
-  // owlqn: FixDirSigns
-  PSERVER_OP_FIX_DIR_SIGNS = 10;
-
-  // owlqn: DirDeriv
-  PSERVER_OP_DIR_DERIV = 11;
-
-  // owlqn: FixOmegaSigns
-  PSERVER_OP_FIX_OMEGA_SIGNS = 12;
-
-  // Get overall cost
-  PSERVER_OP_COST = 13;
-
-  // Pass control
-  PSERVER_OP_START_PASS = 14;
-  PSERVER_OP_FINISH_PASS = 15;
-
-  // randomize value
-  PSERVER_OP_RANDOMIZE = 16;
-
-  // call optimizer apply
-  PSERVER_OP_APPLY = 17;
-}
-
-message ProtoVector {
-  required int64 dim = 1;
-  repeated double values = 2 [ packed = true ];
-}
-
-message ProtoMatrix {
-  required int64 num_rows = 1;
-  required int64 num_cols = 2;
-  repeated double values = 3 [ packed = true ];
-}
-
-message Operation {
-  required MatrixVectorOperation operation = 1;
-
-  // vector handles created on the pserver
-  repeated int64 pvectors = 2; // u, v, w
-
-  // matrix handles created on the pserver
-  repeated int64 pmatrices = 3; // A, B, C
-
-  repeated double scalars = 4;       // a, b, c
-  repeated ProtoVector vectors = 5;  // x, y, z
-  repeated ProtoMatrix matrices = 6; // X, Y, Z
-}
-
-message OperationResult {
-  // error message. Empty if success
-  optional string return_message = 1;
-  //
-  repeated double scalars = 2;       // d, e, f
-  repeated ProtoVector vectors = 3;  // p, q, r
-  repeated ProtoMatrix matrices = 4; // P, Q, R
-}
-
-message DoOperationRequest {
-  repeated Operation operations = 1;
-
-  // If true, wait for gradient to be ready before starting the operations
-  required bool wait_for_gradient = 2;
-
-  // If true, send back the parameter to clients after the operations are
-  // finished
-  required bool send_back_parameter = 3;
-
-  // If true, and if all clients call waitPassFinish,
-  // signal all clients finish the pass
-  required bool release_pass = 4;
-}
-
-message DoOperationResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  repeated OperationResult results = 2;
-
-  required bool pass_finish = 3;
-}
-
-message LoadValueRequest { required string dir_name = 1; }
-
-message LoadValueResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-message SaveValueRequest { required string dir_name = 1; }
-
-message SaveValueResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-enum DataUpdateMode {
-  // Client send it's own data to pserver
-  DATA_UPDATE_MODE_SET_OWN = 0;
-  // Client get all user data from all pservers
-  DATA_UPDATE_MODE_GET_ALL = 1;
-  // Client send it's own ref feature to pserver
-  DATA_UPDATE_MODE_SET_REF = 2;
-  // Client get all ref featuers from all pservers
-  DATA_UPDATE_MODE_GET_REF = 3;
-  // Client send it's own ref label to pserver
-  DATA_UPDATE_MODE_SET_REF_LABEL = 4;
-  // Client get all ref labels from all pservers
-  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
-  // Client send it's own ref grad to pserver
-  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
-  // Client get all ref grad from all pservers
-  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
-}
-
-enum SendDataType {
-  DATA_REF = 0;
-  DATA_REFLABEL = 1;
-  DATA_REFGRAD = 2;
-  DATA_REDUCE_SUM = 3;
-}
-
-enum TransDataType {
-  TRANS_INT32 = 0;
-  TRANS_UINT32_T = 1;
-  TRANS_INT64_T = 2;
-  TRANS_UINT64_T = 3;
-  TRANS_FLOAT = 5;
-  TRANS_DOUBLE = 6;
-}
-
-message DataBlock {
-  // total byte size of this data blcok
-  required uint64 total_size = 1;
-  // byte size of one data type
-  required int32 data_size = 2;
-  // data_type
-  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
-}
-
-message SendDataRequest {
-  required SendDataType type = 1;
-  required DataUpdateMode update_mode = 2;
-  repeated DataBlock blocks = 3;
-  required uint64 client_id = 4;
-  required uint64 server_id = 5;
-}
-
-message SendDataResponse {
-  required SendDataType type = 1;
-  repeated DataBlock blocks = 2;
-  required uint64 server_id = 3;
-}
diff --git a/proto/README.md b/proto/README.md
deleted file mode 100644
index dda7ed7b3c8..00000000000
--- a/proto/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## protos in this folder are legacy v2 protos.
-
-## Please refer to paddle/fluid for latest version.
diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto
deleted file mode 100644
index 9cc20b4a3ef..00000000000
--- a/proto/TrainerConfig.proto
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "DataConfig.proto";
-import "ModelConfig.proto";
-
-package paddle;
-
-message OptimizationConfig {
-  optional int32 batch_size = 3 [ default = 1 ];
-  required string algorithm = 4 [ default = "async_sgd" ];
-  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
-  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];
-
-  required double learning_rate = 7;
-  optional double learning_rate_decay_a = 8 [ default = 0 ];
-  optional double learning_rate_decay_b = 9 [ default = 0 ];
-  optional string learning_rate_schedule = 27 [ default = "constant" ];
-  // learning rate will be scaled according to learning_rate_schedule
-  // 1), constant:
-  // lr = learning_rate
-  // 2), poly:
-  // lr = learning_rate *
-  //      pow(1 + learning_rate_decay_a * num_samples_processed,
-  //          -learning_rate_decay_b)
-  // 3), exp:
-  // lr = learning_rate *
-  //      pow(learning_rate_decay_a,
-  //          num_samples_processed / learning_rate_decay_b)
-  // 4), discexp:
-  // lr = learning_rate *
-  //      pow(learning_rate_decay_a,
-  //          floor(num_samples_processed / learning_rate_decay_b))
-  // 5), linear:
-  // lr = max(learning_rate - learning_rate_decay_a * num_samples_processed,
-  //          learning_rate_decay_b)
-
-  // owlqn related
-  // L1-regularization
-  optional double l1weight = 10 [ default = 0.1 ];
-  // L2-regularization
-  optional double l2weight = 11 [ default = 0 ];
-  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
-  // then accept the step
-  optional double c1 = 12 [ default = 0.0001 ];
-  // multiply the step with "backoff", when wolfe condition doesn't satisfy
-  optional double backoff = 13 [ default = 0.5 ];
-  // how many "s"s and "y"s are kept in owlqn
-  optional int32 owlqn_steps = 14 [ default = 10 ];
-  // accept the step if encountered "max_backoff" times of "reduce the step"
-  optional int32 max_backoff = 15 [ default = 5 ];
-  // L2-regularization coefficient is reduced linearly from iteration 0 to
-  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
-  // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
-  optional int32 l2weight_zero_iter = 17 [ default = 0 ];
-
-  // averaged sgd
-  // About average_window * numBatchProcessed parameter are used
-  // for average. To be accurate, between average_window * numBatchProcessed
-  // and 2 * average_window * numBatchProcessed parameters are used for
-  // average.
-  optional double average_window = 18 [ default = 0 ];
-  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];
-
-  //////////////////////////
-  // Options Adaptive SGD //
-  //////////////////////////
-
-  // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
-  // "rmsprop"
-  // default learning method("momentum") use global decayed learning rate with
-  // momentum.
-  // "adagrad", "adadelta" and "rmsprop" can set momentum too.
-  optional string learning_method = 23 [ default = "momentum" ];
-  optional double ada_epsilon = 24 [ default = 1e-6 ];
-  optional double ada_rou = 26 [ default = 0.95 ];
-
-  // Force to do average in cpu in order to save gpu memory usage
-  optional bool do_average_in_cpu = 25 [ default = false ];
-
-  // delta add rate in pserver, used while num_batches_per_send_parameter>1
-  // will be divided by #machines automatically.
-  optional double delta_add_rate = 28 [ default = 1.0 ];
-
-  // We split a large size into smaller mini-batches, whose sizes are
-  // determined by mini_batch_size. It only takes effect when there is
-  // an ExternalMachine.
-  optional int32 mini_batch_size = 29 [ default = 128 ];
-
-  // automatically set if any one of parameters set sparse remote update flag
-  optional bool use_sparse_remote_updater = 30 [ default = false ];
-
-  // how to update center parameter and feedback to local parameter,
-  // when use local sgd update in cluster training.
-  // A option is elastic_average, proposed by the paper: Deep learning with
-  // elastic averaging SGD.
-  // If use elastic_average method, every trainer node should sample from whole
-  // data sets.
-  optional string center_parameter_update_method = 31 [ default = "average" ];
-
-  // shrink sparse parameter value
-  // only works if parameter is remote sparse update and has L1 decay rate
-  optional double shrink_parameter_value = 32 [ default = 0 ];
-
-  ////////////////////////////
-  // Options Adam Optimizer //
-  ////////////////////////////
-  optional double adam_beta1 = 33 [ default = 0.9 ];
-  optional double adam_beta2 = 34 [ default = 0.999 ];
-  optional double adam_epsilon = 35 [ default = 1e-8 ];
-
-  // arguments for learning rate scheduler
-  // Format: num1:rate1,num2:rate2,...,numK:rateK
-  // For learning_rate_schedule="manual", num is the number of samples,
-  // For learning_rate_schedule="pass_manual",
-  //  num is the number of passes (starting from 0)
-  optional string learning_rate_args = 36 [ default = "" ];
-
-  // for async sgd gradient commit control.
-  // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
-  // current async gradient will be discard silently.
-  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];
-
-  // global threshold for gradient clipping
-  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
-};
-
-message TrainerConfig {
-  optional ModelConfig model_config = 1;
-  optional DataConfig data_config = 2;
-  required OptimizationConfig opt_config = 3;
-  optional DataConfig test_data_config = 4;
-  repeated string config_files = 5;
-
-  // the directory to save/load model files for each training path
-  optional string save_dir = 6 [ default = "./output/model" ];
-
-  // Path of the initial model parameters.
-  // If it was set, start_pass will be ignored.
-  optional string init_model_path = 7;
-
-  // Start training from this pass.
-  // Will load parameter from the previous pass.
-  optional int32 start_pass = 8 [ default = 0 ];
-
-  // file path to the trainer config file
-  optional string config_file = 9;
-}
-- 
GitLab


From eec133ca6a9545e5a05bfa7b8eced8a6a69582c4 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:48:41 +0800
Subject: [PATCH 17/73] remove legacy testing code

---
 paddle/testing/TestMain.cpp |  22 ----
 paddle/testing/TestUtil.cpp | 222 ------------------------------------
 paddle/testing/TestUtil.h   |  78 -------------
 3 files changed, 322 deletions(-)
 delete mode 100644 paddle/testing/TestMain.cpp
 delete mode 100644 paddle/testing/TestUtil.cpp
 delete mode 100644 paddle/testing/TestUtil.h

diff --git a/paddle/testing/TestMain.cpp b/paddle/testing/TestMain.cpp
deleted file mode 100644
index 1811dbbd1a5..00000000000
--- a/paddle/testing/TestMain.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/utils/Util.h"
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
deleted file mode 100644
index fa8efc20f59..00000000000
--- a/paddle/testing/TestUtil.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TestUtil.h"
-#include <gflags/gflags.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
-
-namespace paddle {
-
-std::string randStr(const int len) {
-  std::string str =
-      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-  std::string s = "";
-  for (int i = 0; i < len; ++i) s += str[(rand() % 62)];  // NOLINT
-  return s;
-}
-
-MatrixPtr makeRandomSparseMatrix(size_t height,
-                                 size_t width,
-                                 bool withValue,
-                                 bool useGpu,
-                                 bool equalNnzPerSample) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  std::vector<int64_t> ids(height);
-  std::vector<int64_t> indices(height + 1);
-  indices[0] = 0;
-
-  std::function<size_t()> randomer = [] { return uniformRandom(10); };
-  if (equalNnzPerSample) {
-    size_t n = 0;
-    do {
-      n = uniformRandom(10);
-    } while (!n);
-    randomer = [=] { return n; };
-  }
-  for (size_t i = 0; i < height; ++i) {
-    indices[i + 1] = indices[i] + std::min(randomer(), width);
-    ids[i] = i;
-  }
-
-  if (!withValue) {
-    std::vector<sparse_non_value_t> data;
-    data.resize(indices[height] - indices[0]);
-    for (size_t i = 0; i < data.size(); ++i) {
-      data[i].col = uniformRandom(width);
-    }
-    auto mat = Matrix::createSparseMatrix(
-        height, width, data.size(), NO_VALUE, SPARSE_CSR, false, useGpu);
-    if (useGpu) {
-      std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
-    } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data());
-    }
-    return mat;
-  } else {
-    std::vector<sparse_float_value_t> data;
-    data.resize(indices[height] - indices[0]);
-    for (size_t i = 0; i < data.size(); ++i) {
-      data[i].col = uniformRandom(width);
-      data[i].value = rand() / static_cast<float>(RAND_MAX);  // NOLINT
-    }
-    auto mat = Matrix::createSparseMatrix(
-        height, width, data.size(), FLOAT_VALUE, SPARSE_CSR, false, useGpu);
-    if (useGpu) {
-      std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
-    } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data());
-    }
-    return mat;
-  }
-#endif
-  return nullptr;
-}
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    IVectorPtr& sequenceStartPositions) {
-  ICpuGpuVectorPtr gpuCpuVec;
-  generateSequenceStartPositions(batchSize, gpuCpuVec);
-  sequenceStartPositions = gpuCpuVec->getMutableVector(false);
-}
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    ICpuGpuVectorPtr& sequenceStartPositions) {
-  int numSeqs;
-  if (FLAGS_fixed_seq_length != 0) {
-    numSeqs = std::ceil((float)batchSize / (float)FLAGS_fixed_seq_length);
-  } else {
-    numSeqs = batchSize / 10 + 1;
-  }
-  sequenceStartPositions =
-      ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-  int* buf = sequenceStartPositions->getMutableData(false);
-  int64_t pos = 0;
-  int len = FLAGS_fixed_seq_length;
-  int maxLen = 2 * batchSize / numSeqs;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (FLAGS_fixed_seq_length == 0) {
-      len = uniformRandom(
-                std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
-            1;
-    }
-    buf[i] = pos;
-    pos += len;
-    VLOG(1) << " len=" << len;
-  }
-  buf[numSeqs] = batchSize;
-}
-
-void generateSubSequenceStartPositions(
-    const ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  const int* buf = sequenceStartPositions->getData(false);
-  int numOnes = 0;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (buf[i + 1] - buf[i] == 1) {
-      ++numOnes;
-    }
-  }
-  // each seq has two sub-seq except length 1
-  int numSubSeqs = numSeqs * 2 - numOnes;
-  subSequenceStartPositions =
-      ICpuGpuVector::create(numSubSeqs + 1, /* useGpu= */ false);
-  int* subBuf = subSequenceStartPositions->getMutableData(false);
-  int j = 0;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (buf[i + 1] - buf[i] == 1) {
-      subBuf[j++] = buf[i];
-    } else {
-      int len = uniformRandom(buf[i + 1] - buf[i] - 1) + 1;
-      subBuf[j++] = buf[i];
-      subBuf[j++] = buf[i] + len;
-    }
-  }
-  subBuf[j] = buf[numSeqs];
-}
-
-void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims) {
-  /* generate sequences with 2 dims */
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  int numDims = 2;
-
-  cpuSequenceDims = IVector::create(numSeqs * numDims, /* useGpu= */ false);
-  int* bufStarts = sequenceStartPositions->getData();
-  int* bufDims = cpuSequenceDims->getData();
-
-  for (int i = 0; i < numSeqs; i++) {
-    int len = bufStarts[i + 1] - bufStarts[i];
-    /* get width and height randomly */
-    std::vector<int> dimVec;
-    for (int j = 0; j < len; j++) {
-      if (len % (j + 1) == 0) {
-        dimVec.push_back(1);
-      }
-    }
-    int idx = rand() % dimVec.size();  // NOLINT use rand_r
-    bufDims[i * numDims] = dimVec[idx];
-    bufDims[i * numDims + 1] = len / dimVec[idx];
-  }
-}
-
-void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims) {
-  /* generate sequences with 2 dims */
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  int numDims = 2;
-
-  cpuSequenceDims = IVector::create(numSeqs * numDims, /* useGpu= */ false);
-  const int* bufStarts = sequenceStartPositions->getData(false);
-  int* bufDims = cpuSequenceDims->getData();
-
-  for (int i = 0; i < numSeqs; i++) {
-    int len = bufStarts[i + 1] - bufStarts[i];
-    /* get width and height randomly */
-    std::vector<int> dimVec;
-    for (int j = 0; j < len; j++) {
-      if (len % (j + 1) == 0) {
-        dimVec.push_back(1);
-      }
-    }
-    int idx = rand() % dimVec.size();  // NOLINT use rand_r
-    bufDims[i * numDims] = dimVec[idx];
-    bufDims[i * numDims + 1] = len / dimVec[idx];
-  }
-}
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
-  EXPECT_EQ(a->getWidth(), b->getWidth());
-  EXPECT_EQ(a->getHeight(), b->getHeight());
-  EXPECT_EQ(a->isTransposed(), b->isTransposed());
-  for (size_t r = 0; r < a->getHeight(); ++r) {
-    for (size_t c = 0; c < a->getWidth(); ++c) {
-      EXPECT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
-    }
-  }
-}
-
-void checkVectorEqual(const IVectorPtr& a, const IVectorPtr& b) {
-  EXPECT_EQ(a->getSize(), b->getSize());
-  for (size_t r = 0; r < a->getSize(); ++r) {
-    EXPECT_FLOAT_EQ(a->get(r), b->get(r));
-  }
-}
-}  // namespace paddle
diff --git a/paddle/testing/TestUtil.h b/paddle/testing/TestUtil.h
deleted file mode 100644
index 98b864e3c56..00000000000
--- a/paddle/testing/TestUtil.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-std::string randStr(const int len);
-
-inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
-
-inline bool approximatelyEqual(float a, float b, float epsilon) {
-  return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
-}
-
-MatrixPtr makeRandomSparseMatrix(size_t height,
-                                 size_t width,
-                                 bool withValue,
-                                 bool useGpu,
-                                 bool equalNnzPerSample = false);
-
-/**
- * @brief generate sequenceStartPositions for INPUT_SEQUENCE_DATA,
- *        INPUT_HASSUB_SEQUENCE_DATA and INPUT_SEQUENCE_LABEL
- *
- * @param batchSize                      batchSize
- *        sequenceStartPositions[out] generation output
- */
-void generateSequenceStartPositions(size_t batchSize,
-                                    IVectorPtr& sequenceStartPositions);
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    ICpuGpuVectorPtr& sequenceStartPositions);
-
-/**
- * @brief generate subSequenceStartPositions for INPUT_HASSUB_SEQUENCE_DATA
- *        according to sequenceStartPositions
- *
- * @param sequenceStartPositions[in]     input
- *        subSequenceStartPositions[out] generation output
- */
-void generateSubSequenceStartPositions(const IVectorPtr& sequenceStartPositions,
-                                       IVectorPtr& subSequenceStartPositions);
-
-void generateSubSequenceStartPositions(
-    const ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions);
-
-/**
- * @brief generate cpuSequenceDims for INPUT_SEQUENCE_MDIM_DATA according to
- *        sequenceStartPositions
- *
- * @param sequenceStartPositions[in]     input
- *        cpuSequenceDims[out]              generation output
- */
-void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims);
-void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims);
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b);
-
-void checkVectorEqual(const IVectorPtr& a, const IVectorPtr& b);
-}  // namespace paddle
-- 
GitLab


From 5316c647766b19605a87e2eb98dba8ff6df2aadb Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:59:15 +0800
Subject: [PATCH 18/73] remove legacy cluster_train code

---
 paddle/scripts/cluster_train/conf.py          | 37 ---------
 paddle/scripts/cluster_train/paddle.py        | 82 -------------------
 paddle/scripts/cluster_train/run.sh           | 27 ------
 .../scripts/cluster_train_v2/fabric/conf.py   | 39 ---------
 .../fabric/docker_cluster/Dockerfile          | 11 ---
 .../fabric/docker_cluster/ssh_servers.yaml    | 23 ------
 paddle/scripts/cluster_train_v2/fabric/run.sh | 14 ----
 .../openmpi/docker_cluster/Dockerfile         | 43 ----------
 .../openmpi/docker_cluster/head.yaml          | 25 ------
 .../openmpi/docker_cluster/mpi-nodes.yaml     | 26 ------
 .../openmpi/docker_cluster/ssh/config         |  1 -
 .../openmpi/docker_cluster/ssh/id_rsa.mpi     | 27 ------
 .../openmpi/docker_cluster/ssh/id_rsa.mpi.pub |  1 -
 .../openmpi/start_mpi_train.sh                | 32 --------
 14 files changed, 388 deletions(-)
 delete mode 100644 paddle/scripts/cluster_train/conf.py
 delete mode 100644 paddle/scripts/cluster_train/paddle.py
 delete mode 100644 paddle/scripts/cluster_train/run.sh
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/conf.py
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/run.sh
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh

diff --git a/paddle/scripts/cluster_train/conf.py b/paddle/scripts/cluster_train/conf.py
deleted file mode 100644
index c77d7584d3c..00000000000
--- a/paddle/scripts/cluster_train/conf.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-HOSTS = [
-    "root@192.168.100.17",
-    "root@192.168.100.18",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/home/paddle"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py
deleted file mode 100644
index ba313ac6a18..00000000000
--- a/paddle/scripts/cluster_train/paddle.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" module for launching cluster job """
-
-import os
-import argparse
-import socket
-import copy
-import time
-import signal
-
-from fabric.api import run, put, settings, env, prefix
-from fabric.tasks import execute
-
-#configuration for cluster
-import conf
-
-
-def refine_unknown_args(cmd_args):
-    '''
-    refine unknown parameters to handle some special parameters
-    '''
-    new_args = []
-    for arg in cmd_args:
-        if arg.startswith("--") and arg.find("=") != -1:
-            equal_pos = arg.find("=")  #find first = pos
-            arglist = list(arg)
-            arglist[equal_pos] = " "
-            arg = "".join(arglist)
-            arg = arg.lstrip("-")
-            new_args += arg.split(" ")
-        elif arg.startswith("--") and arg.find("=") == -1:
-            arg = arg.lstrip("-")
-            new_args.append(arg)
-        else:
-            new_args.append(arg)
-    return new_args
-
-
-def kill_process():
-    '''
-    kill comments threads
-    '''
-    run("ps aux \
-         | grep paddle_process_by_paddle \
-         | grep -v grep  \
-         | awk '{print $2}' \
-         | xargs kill > /dev/null 2>&1")
-
-
-def job_prepare(jobdir, data=None):
-    '''
-    prepare job related workspace data
-
-    Assuming you already installed PaddlePaddle in all nodes which means
-    PaddlePaddle related bins and dependencies libraries.
-    Assuming the train/test data have already been installed.
-    This function just prepare all related model and other resources
-    needed at runtime.
-    '''
-
-    def job_create_workspace(jobdir, data=None):
-        '''
-        prepare job workspace, common file, etc.
-        '''
-        log = os.path.join(jobdir, "log")
-        if data is not None:
-            #create job dir
-            run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
-            #push data and paddle bin
diff --git a/paddle/scripts/cluster_train/run.sh b/paddle/scripts/cluster_train/run.sh
deleted file mode 100644
index 331c6498813..00000000000
--- a/paddle/scripts/cluster_train/run.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-#python paddle.py \
-#  --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
-#  --dot_period=10 \
-#  --ports_num_for_sparse=2 \
-#  --log_period=50 \
-#  --num_passes=10 \
-#  --trainer_count=4 \
-#  --saving_period=1 \
-#  --local=0 \
-#  --config=./trainer_config.py \
-#  --save_dir=./output \
-#  --use_gpu=0
-
-python paddle.py \
-  --job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
-  --dot_period=10 \
-  --ports_num_for_sparse=2 \
-  --log_period=50 \
-  --num_passes=10 \
-  --trainer_count=4 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/fabric/conf.py b/paddle/scripts/cluster_train_v2/fabric/conf.py
deleted file mode 100644
index e96503d093a..00000000000
--- a/paddle/scripts/cluster_train_v2/fabric/conf.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-HOSTS = [
-    "root@10.1.9.7",
-    "root@10.1.18.7",
-    "root@10.1.32.9",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/root"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 1
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 1
-#trainer whether use gpu
-PADDLE_USE_GPU = "False"
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
deleted file mode 100644
index 6606c01265a..00000000000
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
-RUN apt-get update && apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-
-RUN echo 'root:root' |chpasswd
-
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
deleted file mode 100644
index 0784b2d1b87..00000000000
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: ssh-servers
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: ssh-servers
-    spec:
-      containers:
-      - name: ssh-servers
-        image: docker.paddlepaddlehub.com/paddlessh
-        resources:
-          limits:
-            cpu: 500m
-            memory: 1Gi
-          requests:
-            cpu: 500m
-            memory: 1Gi
-        ports:
-        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh
deleted file mode 100644
index f6324bcb136..00000000000
--- a/paddle/scripts/cluster_train_v2/fabric/run.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-python paddle.py \
-  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
-  --dot_period=10 \
-  --ports_num_for_sparse=1 \
-  --log_period=50 \
-  --num_passes=5 \
-  --trainer_count=2 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
deleted file mode 100644
index c2f631bdf4e..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-# Build this image:  docker build -t mpi .
-#
-
-FROM paddlepaddle/paddle:0.10.0rc3
-
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt-get update -y && \
-    apt-get upgrade -y && \
-    apt-get install -y openssh-server zip unzip vim sudo \
-gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
-pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
-mkdir /var/run/sshd && \
-echo 'root:tutorial' | chpasswd && \
-sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
-# SSH login fix. Otherwise user is kicked off after login
-sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
-echo "export VISIBLE=now" >> /etc/profile && \
-adduser --disabled-password --gecos "" tutorial && \
-echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
-mkdir /home/tutorial/.ssh/
-
-ENV HOME /home/tutorial
-ENV NOTVISIBLE "in users profile"
-
-# ------------------------------------------------------------
-# Set-Up SSH with our Github deploy key
-# ------------------------------------------------------------
-
-ADD ssh/config /home/tutorial/.ssh/config
-ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
-
-#---------------------------------------------------------------
-#LD_LIBRARY_PATH
-#---------------------------------------------------------------
-
-RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/
-
-WORKDIR /home/tutorial
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
deleted file mode 100644
index 34835e5eb8d..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-header
-  labels:
-    app: mpi-header
-spec:
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        app: mpi-header
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-header
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
deleted file mode 100644
index 2fd5cb4d44a..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-nodes
-  labels:
-    app: mpi-nodes
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: mpi-nodes
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-nodes
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
-        imagePullPolicy: Always
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
deleted file mode 100644
index a9ecad07c39..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+++ /dev/null
@@ -1 +0,0 @@
-StrictHostKeyChecking no
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
deleted file mode 100644
index 23768343edf..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+++ /dev/null
@@ -1,27 +0,0 @@
------BEGIN RSA PRIVATE KEY-----
-MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
-1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
-O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
-36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
-mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
-bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
-OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
-TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
-79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
-YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
-mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
-lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
-rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
-DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
-44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
-fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
-cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
-g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
-yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
-PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
-v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
-hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
-sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
-zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
-yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
------END RSA PRIVATE KEY-----
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
deleted file mode 100644
index 015f2b42e71..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+++ /dev/null
@@ -1 +0,0 @@
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
deleted file mode 100644
index 2a7f4636274..00000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# General trainning configurations
-
-NICS=eth0
-PADDLE_INIT_PORT=7164
-PADDLE_INIT_PORTS_NUM=1
-PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
-PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
-PADDLE_INIT_USE_GPU=False
-
-PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
-PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
-PADDLE_CLUSTER_TRAIN=True
-
-env
-
-# start pserver
-stdbuf -oL nohup paddle pserver \
-  --port=$PADDLE_INIT_PORT \
-  --ports_num=$PADDLE_INIT_PORTS_NUM \
-  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-  --nics=$NICS \
-  --comment=paddle_cluster_pserver \
-  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
-  &> logs/pserver.log &
-
-# start trainer
-# NOTE: train.py will use the above environment variables as configuration
-python train.py &> logs/train.log
-
-# kill background pservers when train finishes
-ps -ef | grep pserver | awk '{print $2}' | xargs kill
-- 
GitLab


From 3ede8b67e6913e19c3db523f25ed5c95c061f321 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 15:20:00 +0800
Subject: [PATCH 19/73] update CMakeLists.txt

---
 CMakeLists.txt                                | 18 ---------
 paddle/CMakeLists.txt                         | 30 +-------------
 .../operators/positive_negative_pair_op.h     |  1 -
 .../sigmoid_cross_entropy_with_logits_op.h    |  1 -
 paddle/testing/CMakeLists.txt                 | 10 +----
 python/CMakeLists.txt                         | 40 +------------------
 python/setup.py.in                            | 32 ---------------
 7 files changed, 5 insertions(+), 127 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6aa8f1b85c..a51552d96a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -279,9 +279,6 @@ include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
-include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 
 set(EXTERNAL_LIBS
     gflags
@@ -320,21 +317,6 @@ if(USE_NNPACK)
     list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
-add_subdirectory(proto)
-
-if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
-    # "add_subdirectory(go)" should be placed after the following loine,
-    # because it depends on paddle/optimizer.
-    add_subdirectory(paddle/legacy/optimizer)
-endif()
-
-# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
-# placed after this block, because they depends on it.
-if(WITH_GOLANG)
-    enable_language(Go)
-    add_subdirectory(go)
-endif(WITH_GOLANG)
-
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 6b665a9effb..c0c04d47595 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,30 +1,4 @@
-if(NOT WITH_FLUID_ONLY)
-  add_subdirectory(legacy/cuda)
-  add_subdirectory(legacy/function)
-  add_subdirectory(legacy/utils)
-  add_subdirectory(legacy/math)
-  add_subdirectory(legacy/gserver)
-  add_subdirectory(legacy/parameter)
-
-  if(MOBILE_INFERENCE)
-    add_subdirectory(legacy/capi)
-  else()
-    add_subdirectory(legacy/pserver)
-    add_subdirectory(legacy/trainer)
-    add_subdirectory(scripts)
-
-    if(WITH_C_API)
-      add_subdirectory(legacy/capi)
-    endif()
-
-    if(WITH_SWIG_PY)
-      add_subdirectory(legacy/api)
-    endif()
-  endif()
-endif()
-
+add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
-  add_subdirectory(fluid)
-endif()
+add_subdirectory(fluid)
diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
index db0a1002f47..a47deb18b6f 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index b8731c23275..6e75f9e0b8d 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 614596958e3..dc6245ce6b0 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,13 +1,5 @@
 # for paddle test case
 
 if(WITH_TESTING)
-  add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
-  if(NOT WIN32)
-    add_library(paddle_test_util STATIC TestUtil.cpp)
-    add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
-  endif(NOT WIN32)
-  if(NOT MOBILE_INFERENCE)
-    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
-  endif()
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
 endif()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 72c0d03e522..37ad77549c8 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,27 +4,6 @@ set(PY_FILES paddle/__init__.py
   ${UTILS_PY_FILES}
   ${FLUID_PY_FILES})
 
-if(NOT WITH_FLUID_ONLY)
-  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
-  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
-  set(PY_FILES ${PY_FILES}
-    ${TRAINER_PY_FILES}
-    ${HELPERS_PY_FILES}
-    ${V2_PY_FILES})
-
-  add_custom_target(copy_paddle_master)
-
-  SET(COPY_PADDLE_MASTER "")
-  if(WITH_GOLANG)
-    SET(COPY_PADDLE_MASTER "copy_paddle_master")
-    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-      )
-    add_dependencies(copy_paddle_master paddle_master)
-  endif(WITH_GOLANG)
-endif()
-
 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
 if(WITH_MKLML)
@@ -64,7 +43,7 @@ IF(WIN32)
             COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
             COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-            DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND touch stub.cc
@@ -74,16 +53,10 @@ ELSE(WIN32)
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
 		COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ENDIF()
 
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
-if(NOT WITH_FLUID_ONLY)
-    set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
-    if(WITH_SWIG_PY)
-        list(APPEND paddle_python_deps python_api_wheel)
-    endif()
-endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
@@ -91,15 +64,6 @@ set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 if (WITH_TESTING)
   add_subdirectory(paddle/reader/tests)
   add_subdirectory(paddle/dataset/tests)
-  if(NOT WITH_FLUID_ONLY)
-    add_subdirectory(paddle/trainer_config_helpers/tests)
-    if (WITH_SWIG_PY)
-      # enable v2 API unittest only when paddle swig api is compiled
-      add_subdirectory(paddle/v2/tests)
-      add_subdirectory(paddle/v2/plot/tests)
-      add_subdirectory(paddle/v2/reader/tests)
-    endif()
-  endif()
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
 endif()
diff --git a/python/setup.py.in b/python/setup.py.in
index c9afe6c8856..730b2e1f71c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -117,17 +117,6 @@ packages=['paddle',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
 
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    packages+=['paddle.proto',
-               'paddle.trainer',
-               'paddle.trainer_config_helpers',
-               'paddle.v2',
-               'paddle.v2.master',
-               'paddle.v2.plot',
-               'paddle.v2.reader',
-               'paddle.v2.dataset',
-               'py_paddle']
-
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
 
@@ -136,19 +125,8 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    paddle_bin_dir = 'opt/paddle/bin'
-    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_trainer',
-                   '${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_merge_model',
-                   '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main',
-                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
 
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
-    package_data['py_paddle']=['*.py','_swig_paddle' + ext_name]
-
 package_dir={
     '': '${PADDLE_BINARY_DIR}/python',
     # The paddle.fluid.proto will be generated while compiling.
@@ -157,8 +135,6 @@ package_dir={
     'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
     'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
 }
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
 
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
@@ -226,14 +202,6 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         if os.system(command) != 0:
             raise Exception("patch core.%s failed, command: %s" % (ext_name, command))
-        if '${WITH_FLUID_ONLY}'== 'OFF':
-            # change rpath of _swig_paddle.xx.
-            if "@APPLE@" == "1":
-                command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
-            else:
-                command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
-            if os.system(command) != 0:
-                raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command))
 
 ext_modules = [Extension('_foo', ['stub.cc'])]
 if os.name == 'nt':
-- 
GitLab


From e2ba9668b4a0b9b8c820f8fe152b1f6fc65310e9 Mon Sep 17 00:00:00 2001
From: zhaozhehao <zhaozhehao@outlook.com>
Date: Fri, 18 Jan 2019 15:24:26 +0800
Subject: [PATCH 20/73] Tree conv op (#15217)

* refactor tree2col operator with new memory mechanism test=develop

* test=develop

* test=develop

* Modified API according to panyx0718 test=develop

* fix API change according to heavengate test=develop

* Modify API comment test=develop
---
 paddle/fluid/API.spec                         |   1 +
 paddle/fluid/operators/CMakeLists.txt         |   2 +-
 paddle/fluid/operators/math/CMakeLists.txt    |   1 +
 paddle/fluid/operators/math/tree2col.cc       | 197 +++++++++++++++++
 paddle/fluid/operators/math/tree2col.cu       | 208 ++++++++++++++++++
 paddle/fluid/operators/math/tree2col.h        |  90 ++++++++
 paddle/fluid/operators/tree_conv_op.cc        | 129 +++++++++++
 paddle/fluid/operators/tree_conv_op.cu        |  24 ++
 paddle/fluid/operators/tree_conv_op.h         | 146 ++++++++++++
 python/paddle/fluid/layers/nn.py              |  71 ++++++
 .../tests/unittests/test_tree_conv_op.py      | 120 ++++++++++
 11 files changed, 988 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/math/tree2col.cc
 create mode 100644 paddle/fluid/operators/math/tree2col.cu
 create mode 100644 paddle/fluid/operators/math/tree2col.h
 create mode 100644 paddle/fluid/operators/tree_conv_op.cc
 create mode 100644 paddle/fluid/operators/tree_conv_op.cu
 create mode 100644 paddle/fluid/operators/tree_conv_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_tree_conv_op.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index f6bf54d3394..0a4edea2c3c 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -215,6 +215,7 @@ paddle.fluid.layers.py_func ArgSpec(args=['func', 'x', 'out', 'backward_func', '
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.teacher_student_sigmoid_loss ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0))
 paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tree_conv ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e53a6a562ad..992a2bdd5ad 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -65,7 +65,7 @@ set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 600ab14d37a..dc27e543f0d 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -60,6 +60,7 @@ math_library(matrix_bit_code)
 math_library(unpooling)
 math_library(vol2col)
 math_library(prelu)
+math_library(tree2col DEPS math_function)
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc
new file mode 100644
index 00000000000..05ce5bc7a20
--- /dev/null
+++ b/paddle/fluid/operators/math/tree2col.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/math/tree2col.h"
+#include <deque>
+#include <stack>
+
+namespace paddle {
+namespace operators {
+namespace math {
+using Tensor = framework::Tensor;
+std::vector<TreeNode> Tree2ColUtil::construct_patch(
+    size_t root, int max_depth, const std::vector<std::vector<int>> &tr) {
+  std::stack<TreeNode, std::deque<TreeNode>> stack;
+  std::unordered_map<int, bool> visited;
+  std::vector<TreeNode> patch;
+
+  stack.push(TreeNode(root, 1, 1, 0));
+  patch.emplace_back(TreeNode(root, 1, 1, 0));
+  visited[root] = true;
+
+  while (!stack.empty()) {
+    TreeNode &u = stack.top();
+    bool end = true;
+    size_t node = u.get_node(), sz = tr[node].size();
+    visited[node] = true;
+    for (size_t i = 0; i < sz; i++) {
+      size_t v = tr[node][i];
+      if (!visited[v] && static_cast<int>(u.get_depth()) + 1 < max_depth) {
+        visited[v] = true;
+        stack.push(TreeNode(v, i, sz, u.get_depth() + 1));
+        patch.push_back(TreeNode(v, i + 1, sz, u.get_depth() + 1));
+        end = false;
+      }
+    }
+    if (end) {
+      stack.pop();
+    }
+  }
+  return patch;
+}
+
+void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet,
+                                  std::vector<std::vector<int>> *tr,
+                                  size_t *node_count) {
+  auto edge_set_dims = EdgeSet.dims();
+  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
+  int64_t edge_count = EdgeSet.numel();
+
+  const int *edge_data = EdgeSet.data<int>();
+
+  for (int64_t i = 0; i < edge_count; i += 2) {
+    int u = edge_data[i], v = edge_data[i + 1];
+    if (u != 0 && v != 0) (*node_count)++;
+  }
+  (*node_count)++;
+
+  tr->resize(static_cast<size_t>(*node_count + 1));
+
+  for (int64_t i = 0; i < edge_count; i += 2) {
+    int u = edge_data[i], v = edge_data[i + 1];
+    if (u != 0 && v != 0) {
+      tr->at(u).push_back(v);
+    } else {
+      break;
+    }
+  }
+}
+
+template <typename T>
+class Tree2ColFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext &context,
+                  const framework::Tensor &EdgeSet,
+                  const framework::Tensor &node_features,
+                  framework::Tensor *patch, int max_depth) {
+    std::vector<std::vector<int>> tr;
+    auto feature_dims = node_features.dims();
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    math::SetConstant<platform::CPUDeviceContext, T> constant;
+    int64_t feature_size = feature_dims[1];
+    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
+    size_t node_count = 0, patch_count = 0, patch_size;
+    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
+    std::vector<std::vector<TreeNode>> processing_list;
+    for (size_t u = 1; u <= node_count; u++) {
+      std::vector<TreeNode> temp_patch =
+          Tree2ColUtil::construct_patch(u, max_depth, tr);
+      if (!temp_patch.empty()) {
+        processing_list.emplace_back(temp_patch);
+      }
+    }
+    patch_size = processing_list.size();
+
+    T *patch_data =
+        patch->mutable_data<T>({static_cast<int64_t>(patch_size),
+                                static_cast<int64_t>(patch_elem_size)},
+                               cpu_place);
+    constant(context, patch, 0);
+    const T *features = node_features.data<T>();
+
+    for (auto &patch_item : processing_list) {
+      size_t pointer_base = patch_count * patch_elem_size;
+      for (auto &v : patch_item) {
+        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
+          eta_t = v.eta_t<T>(max_depth);
+        size_t id = v.get_node() - 1;
+        for (int i = 0; i < feature_size; i++) {
+          patch_data[pointer_base + i * 3] +=
+              eta_l * features[id * feature_size + i];
+          patch_data[pointer_base + i * 3 + 1] +=
+              eta_r * features[id * feature_size + i];
+          patch_data[pointer_base + i * 3 + 2] +=
+              eta_t * features[id * feature_size + i];
+        }
+      }
+      patch_count++;
+    }
+    patch->Resize({static_cast<int64_t>(patch_count),
+                   static_cast<int64_t>(patch_elem_size)});
+  }
+};
+template <typename T>
+class Col2TreeFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext &context,
+                  const framework::Tensor &EdgeSet,
+                  const framework::Tensor &out_grad, framework::Tensor *in_grad,
+                  int max_depth) {
+    std::vector<std::vector<int>> tr;
+    auto output_dims = out_grad.dims();
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    math::SetConstant<platform::CPUDeviceContext, T> constant;
+    int64_t output_size = output_dims[1];
+    size_t grad_elem_size = 3 * static_cast<size_t>(output_size);
+    size_t node_count = 0, grad_count = 0;
+    Tree2ColUtil::construct_tree(EdgeSet, &tr, &node_count);
+    std::vector<std::vector<TreeNode>> processing_list;
+    std::vector<std::vector<TreeNode>> grad_list;
+    grad_list.resize(node_count);
+    for (size_t u = 1; u <= node_count; u++) {
+      std::vector<TreeNode> tmp =
+          Tree2ColUtil::construct_patch(u, max_depth, tr);
+      if (!tmp.empty()) {
+        processing_list.push_back(tmp);
+      }
+    }
+    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
+      for (auto v : processing_list[patch_id]) {
+        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
+      }
+    }
+    T *grad_data =
+        in_grad->mutable_data<T>({static_cast<int64_t>(node_count),
+                                  static_cast<int64_t>(grad_elem_size)},
+                                 cpu_place);
+
+    constant(context, in_grad, 0);
+    const T *out_g = out_grad.data<T>();
+    for (auto &patch_item : grad_list) {
+      size_t pointer_base = grad_count * grad_elem_size;
+      for (auto &v : patch_item) {
+        T eta_l = v.eta_l<T>(max_depth), eta_r = v.eta_r<T>(max_depth),
+          eta_t = v.eta_t<T>(max_depth);
+        size_t id = v.get_node() - 1;
+        for (int i = 0; i < output_size; i++) {
+          grad_data[pointer_base + i * 3] +=
+              eta_l * out_g[id * output_size + i];
+          grad_data[pointer_base + i * 3 + 1] +=
+              eta_r * out_g[id * output_size + i];
+          grad_data[pointer_base + i * 3 + 2] +=
+              eta_t * out_g[id * output_size + i];
+        }
+      }
+      grad_count++;
+    }
+  }
+};
+
+template class Tree2ColFunctor<platform::CPUDeviceContext, float>;
+template class Tree2ColFunctor<platform::CPUDeviceContext, double>;
+template class Col2TreeFunctor<platform::CPUDeviceContext, float>;
+template class Col2TreeFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu
new file mode 100644
index 00000000000..3c50a525c2e
--- /dev/null
+++ b/paddle/fluid/operators/math/tree2col.cu
@@ -0,0 +1,208 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stack>
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/tree2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+using Tensor = framework::Tensor;
+using Node = paddle::operators::math::TreeNode;
+template <typename T>  // one thread per (patch_id, feature column j) pair
+__global__ void tree2col(const T* eta, const int* node, const int* index,
+                         const T* vectors, T* result, int feature_size, int n) {
+  const int thread_id =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  const int patch_id = thread_id / feature_size;  // output row
+  const int j = thread_id % feature_size;  // feature column
+  if (patch_id < n) {  // threads past the last patch have nothing to do
+    const int begin_o = patch_id * 3 * feature_size;  // row start in result
+    const int begin = index[patch_id * 2], end = index[patch_id * 2 + 1];
+    T res_l = 0, res_r = 0, res_t = 0;  // sums weighted by eta l / r / t
+    for (int i = begin; i < end; i++) {  // entries [begin, end) of this patch
+      const int id = node[i];  // row of `vectors` this entry reads
+      const T vec = vectors[id * feature_size + j];
+      res_l += eta[i * 3] * vec;
+      res_r += eta[i * 3 + 1] * vec;
+      res_t += eta[i * 3 + 2] * vec;
+    }
+    result[begin_o + j * 3] = res_l;  // the 3 sums of column j are interleaved
+    result[begin_o + j * 3 + 1] = res_r;
+    result[begin_o + j * 3 + 2] = res_t;
+  }
+}
+template <typename T>  // CUDA path: flatten patches on host, reduce on GPU
+class Tree2ColFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const paddle::platform::CUDADeviceContext& context,
+                  const framework::Tensor& EdgeSet,
+                  const framework::Tensor& node_features,
+                  framework::Tensor* patch, int max_depth) {
+    std::vector<std::vector<int>> tr;
+    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+    auto cpu_place = platform::CPUPlace();
+    auto stream = context.stream();
+    auto feature_dims = node_features.dims();
+    math::SetConstant<platform::CUDADeviceContext, T> constant;
+
+    Tensor EdgeSet_cpu;  // host copy: the tree is built on the CPU
+    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
+    int64_t feature_size = feature_dims[1];
+    size_t patch_elem_size = 3 * static_cast<size_t>(feature_size);
+    size_t node_count = 0, total_size = 0;
+    size_t max_size = feature_dims[0];
+    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
+
+    std::vector<std::vector<Node>> processing_list;  // non-empty patches only
+    for (size_t u = 1; u <= node_count; u++) {
+      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
+      if (!tmp.empty()) {
+        processing_list.push_back(tmp);
+        total_size += tmp.size();
+      }
+    }
+
+    size_t patch_size = processing_list.size();
+    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
+    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
+                                           cpu_place);
+    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
+                                     cpu_place);
+    int* index = index_cpu.mutable_data<int>(
+        {static_cast<int64_t>(patch_size * 2)}, cpu_place);
+
+    int idx = 0, index_idx = 0;
+    for (auto& tmp : processing_list) {
+      index[index_idx++] = idx;  // begin offset of this patch
+      for (auto& v : tmp) {
+        node[idx] = static_cast<int>(v.node - 1);  // 1-based id -> 0-based row
+        eta[idx * 3] = v.eta_l<T>(max_depth);
+        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
+        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
+        idx++;
+      }
+      index[index_idx++] = idx;  // end offset (exclusive)
+    }
+    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
+    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
+    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);
+
+    int elem_size = patch_size * feature_size;  // threads that do real work
+    int blocks = (elem_size + 1024 - 1) / 1024;
+    int block_x = 512;
+    int block_y = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(block_x, block_y);  // 512 x block_y blocks of 1024 threads
+
+    patch->mutable_data<T>(
+        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
+        gpu_place);
+    constant(context, patch, 0);  // rows without a patch stay zero
+    tree2col<T><<<grid, threads, 0, stream>>>(
+        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
+        node_features.data<T>(), patch->data<T>(), feature_size, patch_size);
+  }
+};
+template <typename T>  // CUDA backward of tree2col; reuses the same kernel
+class Col2TreeFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& EdgeSet,
+                  const framework::Tensor& patch_grad,
+                  framework::Tensor* embedding_grad, int max_depth) {
+    std::vector<std::vector<int>> tr;
+    auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+    auto cpu_place = platform::CPUPlace();
+    auto stream = context.stream();
+    auto output_dims = patch_grad.dims();
+    math::SetConstant<platform::CUDADeviceContext, T> constant;
+
+    Tensor EdgeSet_cpu;  // host copy: the tree is built on the CPU
+    framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu);
+    int64_t output_size = output_dims[1];
+    size_t patch_elem_size = 3 * static_cast<size_t>(output_size);
+    size_t node_count = 0;
+    size_t max_size = output_dims[0];
+    Tree2ColUtil::construct_tree(EdgeSet_cpu, &tr, &node_count);
+    std::vector<std::vector<Node>> processing_list;
+    std::vector<std::vector<Node>> grad_list;  // per node: patches that use it
+    grad_list.resize(node_count);
+    size_t total_size = 0, grad_size = node_count;
+    for (size_t u = 1; u <= node_count; u++) {
+      std::vector<Node> tmp = Tree2ColUtil::construct_patch(u, max_depth, tr);
+      if (!tmp.empty()) {
+        processing_list.push_back(tmp);
+      }
+    }
+    for (size_t patch_id = 0; patch_id < processing_list.size(); patch_id++) {
+      for (auto v : processing_list[patch_id]) {  // invert node <-> patch
+        grad_list[v.get_node() - 1].push_back(v.change_node(patch_id + 1));
+      }
+    }
+    for (auto& tmp : grad_list) {
+      total_size += tmp.size();
+    }
+
+    Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu;
+    int* node = node_cpu.mutable_data<int>({static_cast<int64_t>(total_size)},
+                                           cpu_place);
+    T* eta = eta_cpu.mutable_data<T>({static_cast<int64_t>(total_size * 3)},
+                                     cpu_place);
+    int* index = index_cpu.mutable_data<int>(
+        {static_cast<int64_t>(grad_size * 2)}, cpu_place);
+
+    size_t idx = 0, index_idx = 0;
+    for (auto& tmp : grad_list) {
+      index[index_idx++] = idx;  // begin offset of this node's entries
+      for (auto& v : tmp) {
+        node[idx] = static_cast<int>(v.node - 1);  // 0-based patch_grad row
+        eta[idx * 3] = v.eta_l<T>(max_depth);
+        eta[idx * 3 + 1] = v.eta_r<T>(max_depth);
+        eta[idx * 3 + 2] = v.eta_t<T>(max_depth);
+        idx++;
+      }
+      index[index_idx++] = idx;  // end offset (exclusive)
+    }
+    framework::TensorCopy(node_cpu, gpu_place, context, &node_gpu);
+    framework::TensorCopy(eta_cpu, gpu_place, context, &eta_gpu);
+    framework::TensorCopy(index_cpu, gpu_place, context, &index_gpu);
+
+    int elem_size = output_size * grad_size;  // threads that do real work
+    int blocks = (elem_size + 1024 - 1) / 1024;
+    int block_x = 512;
+    int block_y = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(block_x, block_y);  // 512 x block_y blocks of 1024 threads
+
+    embedding_grad->mutable_data<T>(
+        {static_cast<int64_t>(max_size), static_cast<int64_t>(patch_elem_size)},
+        gpu_place);
+
+    constant(context, embedding_grad, 0);  // rows never written stay zero
+    tree2col<T><<<grid, threads, 0, stream>>>(
+        eta_gpu.data<T>(), node_gpu.data<int>(), index_gpu.data<int>(),
+        patch_grad.data<T>(), embedding_grad->data<T>(), output_size,
+        grad_size);
+  }
+};
+
+template class Tree2ColFunctor<platform::CUDADeviceContext, float>;
+template class Tree2ColFunctor<platform::CUDADeviceContext, double>;
+template class Col2TreeFunctor<platform::CUDADeviceContext, float>;
+template class Col2TreeFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h
new file mode 100644
index 00000000000..478ba78e259
--- /dev/null
+++ b/paddle/fluid/operators/math/tree2col.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <array>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+namespace operators {
+namespace math {
+class TreeNode {  // one entry of a convolution patch, with its eta weights
+ public:
+  size_t node;  // 1-based id; callers subtract 1 to index arrays
+  explicit TreeNode(size_t node = 0, size_t index = 0, size_t pclen = 0,
+                    size_t depth = 0)
+      : node(node), index(index), pclen(pclen), depth(depth) {}
+  template <typename T>
+  T eta_t(T filter_depth) {  // top weight: (filter_depth - depth) / depth_max
+    return ((filter_depth - this->depth) / filter_depth);
+  }
+  template <typename T>
+  T eta_l(T filter_depth) {  // left weight, scaled by (1 - eta_t)
+    T temp;
+    if (this->pclen == 1) {  // avoids dividing by (pclen - 1) == 0 below
+      temp = 0.5;
+    } else {
+      temp = (this->index - 1.0) / (this->pclen - 1.0);
+    }
+    return (1.0 - this->eta_t<T>(filter_depth)) * temp;
+  }
+  template <typename T>
+  T eta_r(T filter_depth) {  // right weight: (1 - eta_t) * (1 - eta_l)
+    return (1.0 - this->eta_t<T>(filter_depth)) *
+           (1.0 - this->eta_l<T>(filter_depth));
+  }
+  TreeNode change_node(size_t v) {  // copy of this entry with node set to v
+    return TreeNode(v, this->index, this->pclen, this->depth);
+  }
+  size_t get_node() { return this->node; }
+  size_t get_depth() { return this->depth; }
+
+ private:
+  size_t index, pclen, depth;  // NOTE: index/pclen look like sibling idx/count
+};
+class Tree2ColUtil {  // static helpers shared by the CPU and CUDA functors
+ public:
+  static std::vector<TreeNode> construct_patch(  // patch rooted at `root`
+      size_t root, int max_depth, const std::vector<std::vector<int>> &tr);
+
+  static void construct_tree(const Tensor &EdgeSet,  // fills tr, node_count
+                             std::vector<std::vector<int>> *tr,
+                             size_t *node_count);
+};
+
+template <typename DeviceContext, typename T>
+class Tree2ColFunctor {  // primary template; defined per device context
+ public:
+  void operator()(const DeviceContext &context,
+                  const framework::Tensor &EdgeSet,
+                  const framework::Tensor &node_features,
+                  framework::Tensor *patch, int max_depth);  // writes *patch
+};
+template <typename DeviceContext, typename T>
+class Col2TreeFunctor {  // primary template; defined per device context
+ public:
+  void operator()(const DeviceContext &context,
+                  const framework::Tensor &EdgeSet,
+                  const framework::Tensor &out_grad, framework::Tensor *in_grad,
+                  int max_depth);  // writes *in_grad
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc
new file mode 100644
index 00000000000..615ea285e54
--- /dev/null
+++ b/paddle/fluid/operators/tree_conv_op.cc
@@ -0,0 +1,129 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/tree_conv_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {  // shapes below mirror TreeConvOp::InferShape
+    AddInput("NodesVector",
+             "(Tensor) The feature vector of every node on the tree. "
+             "The shape of the feature vector must be "
+             "[batch_size, max_tree_node_size, feature_size].");
+    AddInput("EdgeSet",
+             "(Tensor) The Edges of Tree. The edge must be directional. "
+             "The shape must be [batch_size, max_tree_node_size, 2].");
+    AddInput("Filter",
+             "(Tensor) The feature detector. "
+             "The shape of the filter is "
+             "[feature_size, 3, output_size, num_filters].");
+    AddOutput("Out",
+              "(Tensor) The feature vector of subtrees. "
+              "The shape of the output tensor is [batch_size, "
+              "max_tree_node_size, output_size, num_filters]. "
+              "The output tensor could be a new feature "
+              "vector for next tree convolution layers.");
+    AddAttr<int>("max_depth",
+                 "(int, default: 2) The depth of feature detector.")
+        .SetDefault(2)
+        .GreaterThan(1);
+    AddComment(R"DOC(
+**Tree-Based Convolution Operator**
+
+Tree-Based Convolution is a kind of convolution based on tree structure.
+Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN),
+which is used to classify tree structures, such as Abstract Syntax Tree.
+Tree-Based Convolution proposed a kind of data structure called continuous binary tree,
+which regards multiway tree as binary tree.
+The paper of Tree-Based Convolution Operator is here:
+https://arxiv.org/abs/1409.5718v1
+)DOC");
+  }
+};
+class TreeConvOp : public framework::OperatorWithKernel {  // op: tree_conv
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"));
+    auto edge_dims = ctx->GetInputDim("EdgeSet");
+    auto vector_dims = ctx->GetInputDim("NodesVector");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE_EQ(edge_dims.size(), 3,  // rank first: dim[2] is read next
+                      "The dimension of EdgeSet Tensor should be 3");
+    PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
+    PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
+                      "The dimension of NodesVector Tensor should be 3");
+    PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
+                      "The dimension of Filter Tensor should be 4");
+    PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
+    PADDLE_ENFORCE_EQ(
+        filter_dims[0], vector_dims[2],
+        "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
+    auto output_dims = framework::make_ddim(  // NodesVector[0:2] + Filter[2:4]
+        {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
+    ctx->SetOutputDim("Out", output_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
+                                   ctx.device_context());  // NodesVector dtype
+  }
+};
+
+class TreeConvGradOp : public framework::OperatorWithKernel {  // grad op
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    auto vectors_dims = ctx->GetInputDim("NodesVector");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "the gradient of output(Out) must not be null");
+    if (ctx->HasOutput(framework::GradVarName("Filter"))) {  // optional
+      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("NodesVector"))) {  // optional
+      ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("NodesVector")->type(),
+                                   ctx.device_context());  // NodesVector dtype
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    tree_conv, ops::TreeConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TreeConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    tree_conv_grad,
+    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TreeConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu
new file mode 100644
index 00000000000..eebfe412bdd
--- /dev/null
+++ b/paddle/fluid/operators/tree_conv_op.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/tree_conv_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    tree_conv, ops::TreeConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TreeConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    tree_conv_grad,
+    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::TreeConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h
new file mode 100644
index 00000000000..a84589b32fd
--- /dev/null
+++ b/paddle/fluid/operators/tree_conv_op.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/tree2col.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename DeviceContext, typename T>
+class TreeConvKernel : public framework::OpKernel<T> {  // forward tree_conv
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    math::Tree2ColFunctor<DeviceContext, T> tree2col;
+    math::SetConstant<DeviceContext, T> constant;
+
+    auto *Edges = ctx.Input<Tensor>("EdgeSet");
+    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
+    auto *Filter = ctx.Input<Tensor>("Filter");
+    auto *output_emb = ctx.Output<Tensor>("Out");
+    int max_depth = ctx.Attr<int>("max_depth");
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+    Tensor W;
+    W.ShareDataWith(*Filter);  // W aliases Filter's buffer; only dims differ
+    W.Resize(framework::flatten_to_2d(Filter->dims(), 2));
+
+    int batch_size = static_cast<int>(Edges->dims()[0]);
+    int n = static_cast<int>(Embeddings->dims()[1]);  // node rows per tree
+    int out_size = static_cast<int>(Filter->dims()[2]);
+    int num_filters = static_cast<int>(Filter->dims()[3]);
+    output_emb->mutable_data<T>({batch_size, n, out_size, num_filters},
+                                ctx.GetPlace());
+
+    auto edge_set_slicedim = framework::slice_ddim(  // dims minus batch axis
+        Edges->dims(), 1, static_cast<int>(Edges->dims().size()));
+
+    auto embedding_slicedim = framework::slice_ddim(
+        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
+
+    auto output_slicedim = framework::slice_ddim(
+        output_emb->dims(), 1, static_cast<int>(output_emb->dims().size()));
+
+    output_slicedim = framework::flatten_to_2d(output_slicedim, 1);
+
+    for (int idx = 0; idx < batch_size; idx++) {  // one tree per batch item
+      auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim);
+      auto embeddings =
+          Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim);
+      auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim);
+      Tensor patch;  // filled by tree2col for this tree
+      tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
+      constant(dev_ctx, &out_vec, 0);
+      blas.MatMul(patch, W, &out_vec);  // out_vec = patch * W
+    }
+  }
+};
+template <typename DeviceContext, typename T>
+class TreeConvGradKernel : public framework::OpKernel<T> {  // backward pass
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *out_g = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *in_g = ctx.Output<Tensor>(framework::GradVarName("NodesVector"));
+    auto *filter_g = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+    int max_depth = ctx.Attr<int>("max_depth");
+    auto *Embeddings = ctx.Input<Tensor>("NodesVector");
+    auto *edges = ctx.Input<Tensor>("EdgeSet");
+    auto *Filter = ctx.Input<Tensor>("Filter");
+    math::Tree2ColFunctor<DeviceContext, T> tree2col;
+    math::Col2TreeFunctor<DeviceContext, T> col2tree;
+    math::SetConstant<DeviceContext, T> constant;
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+    Tensor W;
+    W.ShareDataWith(*Filter);  // W aliases Filter's buffer; only dims differ
+    W.Resize(framework::flatten_to_2d(Filter->dims(), 1));
+
+    int batch_size = static_cast<int>(Embeddings->dims()[0]);
+
+    auto edge_set_slicedim = framework::slice_ddim(  // dims minus batch axis
+        edges->dims(), 1, static_cast<int>(edges->dims().size()));
+
+    auto embedding_slicedim = framework::slice_ddim(
+        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
+
+    auto out_grad_dims = framework::slice_ddim(
+        out_g->dims(), 1, static_cast<int>(out_g->dims().size()));
+    out_grad_dims = framework::flatten_to_2d(out_grad_dims, 1);
+    if (filter_g) {  // Filter gradient requested
+      filter_g->mutable_data<T>(Filter->dims(), ctx.GetPlace());
+      Tensor f_g;
+      f_g.ShareDataWith(*filter_g);  // 2-D alias of filter_g
+      f_g.Resize(framework::flatten_to_2d(Filter->dims(), 2));
+      constant(dev_ctx, filter_g, 0);  // zero first; MatMul below accumulates
+      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
+        auto edge_set =
+            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
+        auto embeddings = Embeddings->Slice(batch_id, batch_id + 1)
+                              .Resize(embedding_slicedim);
+        auto out_grad =
+            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
+        Tensor patch;  // recomputed forward patch for this tree
+        tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth);
+        blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0));
+      }
+    }
+    if (in_g) {  // NodesVector gradient requested
+      auto input_grad_dims = framework::slice_ddim(
+          in_g->dims(), 1, static_cast<int>(in_g->dims().size()));
+      in_g->mutable_data<T>(Embeddings->dims(), ctx.GetPlace());
+      constant(dev_ctx, in_g, 0);
+      for (int batch_id = 0; batch_id < batch_size; batch_id++) {
+        auto edge_set =
+            edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim);
+        auto out_grad =
+            out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims);
+        auto in_grad =
+            in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims);
+        Tensor in_grad_temp;  // col2tree output for this tree
+        col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth);
+        blas.MatMul(in_grad_temp, false, W, true, &in_grad);  // temp * W^T
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 093571a93b4..ea88d8b4d09 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -183,6 +183,7 @@ __all__ = [
     'psroi_pool',
     'teacher_student_sigmoid_loss',
     'huber_loss',
+    'tree_conv',
 ]
 
 kIgnoreIndex = -100
@@ -9930,3 +9931,73 @@ def huber_loss(input, label, delta):
                  'Residual': residual},
         attrs={'delta': delta})
     return out
+
+
+@templatedoc()
+def tree_conv(nodes_vector,
+              edge_set,
+              output_size,
+              num_filters=1,
+              max_depth=2,
+              act='tanh',
+              param_attr=None,
+              bias_attr=None,
+              name=None):
+    """
+    ${comment}
+
+    Args:
+        nodes_vector(${nodes_vector_type}): ${nodes_vector_comment}
+        edge_set(${edge_set_type}): ${edge_set_comment}
+        output_size(int): output feature width
+        num_filters(int): number of filters, Default 1
+        max_depth(int): max depth of filters, Default 2
+        act(str): activation function, Default tanh
+        param_attr(ParamAttr): the parameter attribute for the filters, Default None
+        bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
+        name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None
+
+    Returns:
+        out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+          nodes_vector = layers.data(name='vectors', shape=[None, 10, 5], dtype='float32')
+          # None for batch size, 10 for max_node_size of dataset, 5 for vector width
+          edge_set = layers.data(name='edge_set', shape=[None, 10, 2], dtype='float32')
+          # None for batch size, 10 for max_node_size of dataset, 2 for every edge has two nodes
+          # edges must be directional
+          out_vector = layers.tree_conv(nodes_vector, edge_set, 6, 1, 2, 'tanh',
+              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
+          # the shape of output will be [None, 10, 6, 1],
+          # None for batch size, 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter
+          out_vector = layers.reshape(out_vector, shape=[None, 10, 6])
+          # After reshape, output tensor could be nodes_vector for next tree convolution
+          out_vector_2 = layers.tree_conv(out_vector, edge_set, 3, 4, 2, 'tanh',
+              ParamAttr(initializer=Constant(1.0)), ParamAttr(initializer=Constant(1.0)))
+          # also output tensor could be pooling(the pooling in paper called global pooling)
+          pooled = layers.reduce_max(out_vector, dim=2) # global pooling
+    """
+    helper = LayerHelper("tree_conv", **locals())
+    dtype = helper.input_dtype('nodes_vector')
+    feature_size = nodes_vector.shape[2]  # last axis: feature width
+    W_shape = [feature_size, 3, output_size, num_filters]  # 3: l/r/t weights
+    W = helper.create_parameter(
+        attr=param_attr, shape=W_shape, dtype=dtype, is_bias=False)
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=dtype)
+    else:
+        out = helper.create_variable(name=name, dtype=dtype, persistable=False)
+    helper.append_op(
+        type='tree_conv',
+        inputs={'NodesVector': nodes_vector,
+                'EdgeSet': edge_set,
+                'Filter': W},
+        outputs={'Out': out, },
+        attrs={'max_depth': max_depth})
+    if helper.bias_attr:  # bias is optional; skipped when bias_attr is falsy
+        pre_activation = helper.append_bias_op(out)
+    else:
+        pre_activation = out
+    return helper.append_activation(pre_activation)
diff --git a/python/paddle/fluid/tests/unittests/test_tree_conv_op.py b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
new file mode 100644
index 00000000000..712453d2910
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tree_conv_op.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from op_test import OpTest
+
+
+def collect_node_patch(og, max_depth):
+    """
+    Build every convolution patch with a plain recursive walk
+    :param og: original graph
+    :param max_depth: the depth of convolution filters
+    :return: convolution patches
+    """
+
+    def gen(root):
+        patch = [(root, 1, 1, 0, max_depth)]
+
+        def visit(parent, depth):
+            if depth > max_depth:
+                return
+            children = og[parent]
+            if depth + 1 >= max_depth:  # children would leave the window
+                return
+            for idx, child in enumerate(children, 1):
+                patch.append((child, idx, len(children), depth + 1, max_depth))
+                visit(child, depth + 1)
+
+        visit(root, 0)
+        return patch
+
+    # gen() always returns at least the root entry, so no patch is
+    # ever empty: every node id from 1 to len(og) - 1 contributes
+    # exactly one patch.
+    patches = [gen(u) for u in range(1, len(og))]
+    return patches
+
+
+class TestTreeConvOp(OpTest):
+    def setUp(self):
+        self.n = 17  # nodes in the tree; adj_array below holds n - 1 edges
+        self.fea_size = 3
+        self.output_size = 1
+        self.max_depth = 2
+        self.batch_size = 1
+        self.num_filters = 1
+        adj_array = [  # flattened (parent, child) pairs with 1-based ids
+            1, 2, 1, 3, 1, 4, 1, 5, 2, 6, 2, 7, 2, 8, 4, 9, 4, 10, 5, 11, 6, 12,
+            6, 13, 9, 14, 9, 15, 9, 16, 9, 17
+        ]
+        adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32')
+        adj = np.tile(adj, (self.batch_size, 1, 1))  # same tree for each item
+        self.op_type = 'tree_conv'
+        vectors = np.random.random(
+            (self.batch_size, self.n, self.fea_size)).astype('float32')
+        self.inputs = {
+            'EdgeSet': adj,
+            'NodesVector': vectors,
+            'Filter': np.random.random((self.fea_size, 3, self.output_size,
+                                        self.num_filters)).astype('float32')
+        }
+        self.attrs = {'max_depth': self.max_depth}
+        vectors = []  # reused: now collects the expected output per batch item
+        for i in range(self.batch_size):
+            vector = self.get_output_naive(i)
+            vectors.append(vector)
+        self.outputs = {'Out': np.array(vectors).astype('float32'), }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(  # NOTE(review): 0.5 relative error is very loose
+            ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5)
+
+    def get_output_naive(self, batch_id):
+        og = [[] for i in range(1, self.n + 2)]  # n + 1 child lists, 0 unused
+        st = np.array(self.inputs['EdgeSet'][batch_id]).tolist()
+        for e in st:
+            og[e[0]].append(e[1])  # e = [parent, child]
+        patches = collect_node_patch(og, self.max_depth)
+        W = np.array(self.inputs['Filter']).astype('float32')
+        W = np.transpose(W, axes=[1, 0, 2, 3])  # -> [3, fea, out, filters]
+        vec = []
+        for i, patch in enumerate(patches, 1):
+            result = np.zeros((1, W.shape[2], W.shape[3]))
+            for v in patch:  # v = (node, index, sibling count, depth, max)
+                eta_t = float(v[4] - v[3]) / float(v[4])
+                eta_l = (1.0 - eta_t) * (0.5 if v[2] == 1 else
+                                         float(v[1] - 1.0) / float(v[2] - 1.0))
+                eta_r = (1.0 - eta_t) * (1.0 - eta_l)
+                x = self.inputs['NodesVector'][batch_id][v[0] - 1]
+                eta = np.array([eta_l, eta_r, eta_t]).reshape(
+                    (3, 1)).astype('float32')
+                Wconvi = np.tensordot(eta, W, axes=([0], [0]))
+                x = np.array(x).reshape((1, 1, self.fea_size))
+                res = np.tensordot(x, Wconvi, axes=2)
+                result = result + res
+            vec.append(result)
+        vec = np.concatenate(vec, axis=0)
+        vec = np.concatenate(  # zero-pad up to self.n rows
+            [
+                vec, np.zeros(
+                    (self.n - vec.shape[0], W.shape[2], W.shape[3]),
+                    dtype='float32')
+            ],
+            axis=0)
+        return vec
-- 
GitLab


From 193edfa746bab6d4fdf47a2c1944648cdde7d378 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 15:34:43 +0800
Subject: [PATCH 21/73] remove legacy build_android and build_ios

test=develop
---
 .travis.yml                         |   2 -
 paddle/scripts/paddle_build.sh      | 114 +---------------------------
 tools/manylinux1/Dockerfile.android |  55 --------------
 3 files changed, 1 insertion(+), 170 deletions(-)
 delete mode 100644 tools/manylinux1/Dockerfile.android

diff --git a/.travis.yml b/.travis.yml
index 8c2d9f143b3..87de895ddad 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,6 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 services:
@@ -13,7 +12,6 @@ os:
   - linux
 env:
   - JOB=check_style
-  - JOB=build_android
 addons:
   ssh_known_hosts: 13.229.163.131
 before_install:
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0fb29d4b3d4..f58e392684d 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -26,8 +26,6 @@ function print_usage() {
 
     echo -e "\n${RED}Options${NONE}:
     ${BLUE}build${NONE}: run build for x86 platform
-    ${BLUE}build_android${NONE}: run build for android platform
-    ${BLUE}build_ios${NONE}: run build for ios platform
     ${BLUE}test${NONE}: run all unit tests
     ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
@@ -301,110 +299,6 @@ EOF
     make install -j 8
 }
 
-function build_android() {
-    if [ $ANDROID_ABI == "arm64-v8a" ]; then
-      ANDROID_ARCH=arm64
-      if [ $ANDROID_API -lt 21 ]; then
-        echo "Warning: arm64-v8a requires ANDROID_API >= 21."
-        ANDROID_API=21
-      fi
-    else # armeabi, armeabi-v7a
-      ANDROID_ARCH=arm
-    fi
-
-    ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
-
-    cat <<EOF
-    ============================================
-    Generating the standalone toolchain ...
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
-          --arch=$ANDROID_ARCH
-          --platform=android-$ANDROID_API
-          --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
-    ============================================
-EOF
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
-          --arch=$ANDROID_ARCH \
-          --platform=android-$ANDROID_API \
-          --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-
-    BUILD_ROOT=${PADDLE_ROOT}/build_android
-    DEST_ROOT=${PADDLE_ROOT}/install_android
-
-    mkdir -p $BUILD_ROOT
-    cd $BUILD_ROOT
-
-    if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_NEON=ON \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DUSE_EIGEN_FOR_BLAS=ON \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DUSE_EIGEN_FOR_BLAS=OFF \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    elif [ $ANDROID_ABI == "armeabi" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    else
-      echo "Invalid ANDROID_ABI: $ANDROID_ABI"
-    fi
-
-    cat <<EOF
-    ============================================
-    Building in $BUILD_ROOT ...
-    ============================================
-EOF
-    make -j `nproc`
-    make install -j `nproc`
-}
-
-function build_ios() {
-    # Create the build directory for CMake.
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-
-    # Compile paddle binaries
-    cmake .. \
-          -DCMAKE_SYSTEM_NAME=iOS \
-          -DIOS_PLATFORM=OS \
-          -DCMAKE_OSX_ARCHITECTURES="arm64" \
-          -DWITH_C_API=ON \
-          -DUSE_EIGEN_FOR_BLAS=ON \
-          -DWITH_TESTING=OFF \
-          -DWITH_SWIG_PY=OFF \
-          -DCMAKE_BUILD_TYPE=Release
-
-    make -j 2
-}
-
 function run_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
@@ -639,7 +533,7 @@ EOF
     case $LIB_TYPE in
       full)
         # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first
-        make -j `nproc` gen_proto_py framework_py_proto copy_paddle_pybind paddle_python
+        make -j `nproc` framework_py_proto copy_paddle_pybind paddle_python
         ;;
       pybind)
         # Build paddle pybind library. Takes 49 minutes to build. Might timeout
@@ -876,12 +770,6 @@ function main() {
         build
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
-      build_android)
-        build_android
-        ;;
-      build_ios)
-        build_ios
-        ;;
       test)
         run_test
         ;;
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
deleted file mode 100644
index 7eb040902b0..00000000000
--- a/tools/manylinux1/Dockerfile.android
+++ /dev/null
@@ -1,55 +0,0 @@
-FROM ubuntu:16.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-# ENV variables
-ARG ANDROID_ABI
-ARG ANDROID_API
-
-ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
-ENV ANDROID_API=${ANDROID_API:-21}
-
-ENV HOME=/root \
-    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
-
-RUN apt-get update && \
-    apt-get install -y \
-    git python-dev python-pip python-numpy \
-    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
-    apt-get clean -y
-
-# Install Go and glide
-RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-RUN pip install --upgrade pip==9.0.3 && \
-    pip install -U 'protobuf==3.1.0' && \
-    pip install -U wheel sphinx && \
-    pip install pre-commit
-
-# Android NDK
-RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
-    mkdir -p /opt/android-ndk-tmp && \
-    cd /opt/android-ndk-tmp && \
-    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
-    unzip -q android-ndk-r14b-linux-x86_64.zip && \
-    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
-
-- 
GitLab


From bbd921c32210ba94904066bd4eec8669a0ca0f97 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 16:12:37 +0800
Subject: [PATCH 22/73] recover glide for check_style

test=develop
---
 go/glide.lock | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++
 go/glide.yaml |  33 +++++++
 2 files changed, 266 insertions(+)
 create mode 100644 go/glide.lock
 create mode 100644 go/glide.yaml

diff --git a/go/glide.lock b/go/glide.lock
new file mode 100644
index 00000000000..d15fc934dbe
--- /dev/null
+++ b/go/glide.lock
@@ -0,0 +1,233 @@
+hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
+updated: 2017-10-30T03:46:19.137696069Z
+imports:
+- name: github.com/alecthomas/gometalinter
+  version: bae2f1293d092fd8167939d5108d1b025eaef9de
+- name: github.com/beorn7/perks
+  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
+  subpackages:
+  - quantile
+- name: github.com/boltdb/bolt
+  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
+- name: github.com/cockroachdb/cmux
+  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
+- name: github.com/coreos/etcd
+  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
+  subpackages:
+  - alarm
+  - auth
+  - auth/authpb
+  - client
+  - clientv3
+  - clientv3/concurrency
+  - compactor
+  - discovery
+  - embed
+  - error
+  - etcdserver
+  - etcdserver/api
+  - etcdserver/api/etcdhttp
+  - etcdserver/api/v2http
+  - etcdserver/api/v2http/httptypes
+  - etcdserver/api/v3client
+  - etcdserver/api/v3election
+  - etcdserver/api/v3election/v3electionpb
+  - etcdserver/api/v3election/v3electionpb/gw
+  - etcdserver/api/v3lock
+  - etcdserver/api/v3lock/v3lockpb
+  - etcdserver/api/v3lock/v3lockpb/gw
+  - etcdserver/api/v3rpc
+  - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/auth
+  - etcdserver/etcdserverpb
+  - etcdserver/etcdserverpb/gw
+  - etcdserver/membership
+  - etcdserver/stats
+  - lease
+  - lease/leasehttp
+  - lease/leasepb
+  - mvcc
+  - mvcc/backend
+  - mvcc/mvccpb
+  - pkg/adt
+  - pkg/contention
+  - pkg/cors
+  - pkg/cpuutil
+  - pkg/crc
+  - pkg/debugutil
+  - pkg/fileutil
+  - pkg/httputil
+  - pkg/idutil
+  - pkg/ioutil
+  - pkg/logutil
+  - pkg/monotime
+  - pkg/netutil
+  - pkg/pathutil
+  - pkg/pbutil
+  - pkg/runtime
+  - pkg/schedule
+  - pkg/srv
+  - pkg/tlsutil
+  - pkg/transport
+  - pkg/types
+  - pkg/wait
+  - proxy/grpcproxy/adapter
+  - raft
+  - raft/raftpb
+  - rafthttp
+  - snap
+  - snap/snappb
+  - store
+  - version
+  - wal
+  - wal/walpb
+- name: github.com/coreos/go-semver
+  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
+  subpackages:
+  - semver
+- name: github.com/coreos/go-systemd
+  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
+  subpackages:
+  - daemon
+  - journal
+  - util
+- name: github.com/coreos/pkg
+  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
+  subpackages:
+  - capnslog
+- name: github.com/dgrijalva/jwt-go
+  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
+- name: github.com/ghodss/yaml
+  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/go-stack/stack
+  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
+- name: github.com/gogo/protobuf
+  version: 909568be09de550ed094403c2bf8a261b5bb730a
+  subpackages:
+  - proto
+- name: github.com/golang/protobuf
+  version: 4bd1920723d7b7c925de087aa32e2187708897f7
+  subpackages:
+  - jsonpb
+  - proto
+- name: github.com/golang/snappy
+  version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/google/btree
+  version: 925471ac9e2131377a91e1595defec898166fe49
+- name: github.com/grpc-ecosystem/go-grpc-prometheus
+  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
+- name: github.com/grpc-ecosystem/grpc-gateway
+  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
+  subpackages:
+  - runtime
+  - runtime/internal
+  - utilities
+- name: github.com/inconshreveable/log15
+  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
+- name: github.com/jonboulle/clockwork
+  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/mattn/go-colorable
+  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
+- name: github.com/mattn/go-isatty
+  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
+- name: github.com/matttproud/golang_protobuf_extensions
+  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
+  subpackages:
+  - pbutil
+- name: github.com/namsral/flag
+  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
+- name: github.com/PaddlePaddle/recordio
+  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
+- name: github.com/prometheus/client_golang
+  version: c5b7fccd204277076155f10851dad72b76a49317
+  subpackages:
+  - prometheus
+- name: github.com/prometheus/client_model
+  version: 6f3806018612930941127f2a7c6c453ba2c527d2
+  subpackages:
+  - go
+- name: github.com/prometheus/common
+  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
+  subpackages:
+  - expfmt
+  - internal/bitbucket.org/ww/goautoneg
+  - model
+- name: github.com/prometheus/procfs
+  version: a1dba9ce8baed984a2495b658c82687f8157b98f
+  subpackages:
+  - xfs
+- name: github.com/satori/go.uuid
+  version: 879c5887cd475cd7864858769793b2ceb0d44feb
+- name: github.com/sirupsen/logrus
+  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: github.com/ugorji/go
+  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
+  subpackages:
+  - codec
+- name: github.com/xiang90/probing
+  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
+- name: golang.org/x/crypto
+  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+  subpackages:
+  - bcrypt
+  - blowfish
+  - ssh/terminal
+- name: golang.org/x/net
+  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
+  subpackages:
+  - context
+  - http2
+  - http2/hpack
+  - idna
+  - internal/timeseries
+  - lex/httplex
+  - trace
+- name: golang.org/x/sys
+  version: e48874b42435b4347fc52bdee0424a52abc974d7
+  repo: https://github.com/golang/sys.git
+  vcs: git
+  subpackages:
+  - unix
+  - windows
+- name: golang.org/x/text
+  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
+  repo: https://github.com/golang/text.git
+  vcs: git
+  subpackages:
+  - secure/bidirule
+  - transform
+  - unicode/bidi
+  - unicode/norm
+- name: google.golang.org/grpc
+  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
+  subpackages:
+  - codes
+  - credentials
+  - grpclog
+  - internal
+  - keepalive
+  - metadata
+  - naming
+  - peer
+  - stats
+  - tap
+  - transport
+- name: gopkg.in/yaml.v2
+  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
+testImports:
+- name: github.com/davecgh/go-spew
+  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
+  subpackages:
+  - spew
+- name: github.com/pmezard/go-difflib
+  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
+  subpackages:
+  - difflib
+- name: github.com/stretchr/testify
+  version: 05e8a0eda380579888eb53c394909df027f06991
+  subpackages:
+  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
new file mode 100644
index 00000000000..c5d66694acd
--- /dev/null
+++ b/go/glide.yaml
@@ -0,0 +1,33 @@
+package: github.com/PaddlePaddle/Paddle/go
+import:
+- package: github.com/PaddlePaddle/recordio
+- package: github.com/coreos/etcd
+  version: ^3.2.1
+  subpackages:
+  - clientv3
+  - clientv3/concurrency
+  - embed
+  - etcdserver
+- package: github.com/namsral/flag
+  version: ^1.7.4-pre
+- package: github.com/sirupsen/logrus
+  version: ^1.0.0
+- package: github.com/topicai/candy
+- package: golang.org/x/crypto
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+- package: golang.org/x/sys
+  repo: https://github.com/golang/sys.git
+  vcs: git
+- package: golang.org/x/text
+  repo: https://github.com/golang/text.git
+  vcs: git
+- package: github.com/satori/go.uuid
+  version: v1.1.0
+- package: github.com/alecthomas/gometalinter
+  version: v1.2.1
+- package: github.com/inconshreveable/log15
+  version: v2.13
+- package: github.com/go-stack/stack
+  version: v1.6.0
+- package: github.com/golang/protobuf
-- 
GitLab


From 579d7582549f4c886dcf26ad3ba0b4de3bb6b7f9 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Thu, 17 Jan 2019 12:41:52 +0000
Subject: [PATCH 23/73] fix jitkernel tests and refine benchmark

test=develop
---
 paddle/fluid/operators/jit/benchmark.cc | 130 ++++++++++++++++++------
 paddle/fluid/operators/jit/test.cc      |  30 ++++--
 2 files changed, 117 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index b39ce280939..e7041b12288 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -26,6 +26,49 @@
 DEFINE_int32(burning, 10, "Burning times.");
 DEFINE_int32(repeat, 3000, "Repeat times.");
 DEFINE_int32(max_size, 1000, "The Max size would be tested.");
+DEFINE_string(filter, "", "The Benchmark name would be run.");
+
+class BenchJITKernel {
+ public:
+  BenchJITKernel() = default;
+  virtual ~BenchJITKernel() = default;
+  virtual void Run() = 0;
+  virtual const char* Name() = 0;
+  virtual const char* Dtype() = 0;
+  virtual const char* Place() = 0;
+};
+
+static std::vector<BenchJITKernel*> g_all_benchmarks;
+
+BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
+  g_all_benchmarks.push_back(b);
+  return b;
+}
+
+#define BENCH_JITKERNEL(name, dtype, place)                                    \
+  class BenchJITKernel_##name##_##dtype##_##place##_ : public BenchJITKernel { \
+   public:                                                                     \
+    const char* Name() override { return #name; }                              \
+    const char* Dtype() override { return #dtype; }                            \
+    const char* Place() override { return #place; }                            \
+    void Run() override;                                                       \
+  };                                                                           \
+  static auto inserted_##name##_##dtype##_##place##_ =                         \
+      InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
+  void BenchJITKernel_##name##_##dtype##_##place##_::Run()
+
+#define BENCH_FP32_CPU(name) BENCH_JITKERNEL(name, FP32, CPU)
+
+void RUN_ALL_BENCHMARK() {
+  for (auto p : g_all_benchmarks) {
+    if (!FLAGS_filter.empty() && FLAGS_filter != p->Name()) {
+      continue;
+    }
+    LOG(INFO) << "Benchmark " << p->Name() << "." << p->Dtype() << "."
+              << p->Place();
+    p->Run();
+  }
+}
 
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
@@ -228,49 +271,70 @@ void BenchMatMulKernel() {
   }
 }
 
+using T = float;
+using PlaceType = paddle::platform::CPUPlace;
+
+// xyzn
+BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
+
+// axyn
+BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
+
+// xyn
+BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
+
+// lstm and peephole
+BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
+
+// gru functions
+BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
+
+BENCH_FP32_CPU(kGRUHtPart1) {
+  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
+}
+
+BENCH_FP32_CPU(kGRUHtPart2) {
+  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
+}
+
+// seq pool function
+BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
+
+// matmul
+BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
 //     --burning: the burning time before count
 //     --repeat: the repeat times
 //     --max_size: the max size would be tested
+//     --filter: the bench name would be run
 int main(int argc, char* argv[]) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   google::InitGoogleLogging(argv[0]);
   LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat
             << " times.";
-  using T = float;
-  using PlaceType = paddle::platform::CPUPlace;
-  // xyzn
-  BenchXYZNKernel<jit::kVMul, T, PlaceType>();
-  BenchXYZNKernel<jit::kVAdd, T, PlaceType>();
-  BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>();
-  BenchXYZNKernel<jit::kVSub, T, PlaceType>();
-
-  // axyn
-  BenchAXYNKernel<jit::kVScal, T, PlaceType>();
-  BenchAXYNKernel<jit::kVAddBias, T, PlaceType>();
-
-  // xyn
-  BenchXYNKernel<jit::kVRelu, T, PlaceType>();
-  BenchXYNKernel<jit::kVIdentity, T, PlaceType>();
-  BenchXYNKernel<jit::kVSquare, T, PlaceType>();
-  BenchXYNKernel<jit::kVExp, T, PlaceType>();
-  BenchXYNKernel<jit::kVSigmoid, T, PlaceType>();
-  BenchXYNKernel<jit::kVTanh, T, PlaceType>();
-
-  // lstm and peephole
-  BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>();
-  BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>();
-
-  // gru functions
-  BenchGRUKernel<jit::kGRUH1, T, PlaceType>();
-  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
-  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
-
-  // seq pool function
-  BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>();
 
-  // matmul
-  BenchMatMulKernel<jit::kMatMul, T, PlaceType>();
+  RUN_ALL_BENCHMARK();
 }
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index f4415a54ca9..68a79b6314e 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -22,6 +22,8 @@
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
+static double acc = 1e-5;
+
 template <typename T>
 void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
                const T upper = static_cast<T>(20.f)) {
@@ -37,7 +39,7 @@ template <typename T>
 void ExpectEQ(const T* target, const T* refer, int n) {
   if (std::is_floating_point<T>::value) {
     for (int i = 0; i < n; ++i) {
-      EXPECT_NEAR(target[i], refer[i], 1e-5);
+      EXPECT_NEAR(target[i], refer[i], acc);
     }
   } else {
     for (int i = 0; i < n; ++i) {
@@ -62,7 +64,9 @@ namespace jit = paddle::operators::jit;
 
 template <typename KernelTuples, typename... Args>
 struct TestFuncWithRefer {
-  void operator()(const typename KernelTuples::func_type tgt, Args... args) {}
+  void operator()(const typename KernelTuples::func_type tgt, Args... args) {
+    LOG(FATAL) << "Should specify this function.";
+  }
 };
 
 template <typename T>
@@ -140,7 +144,8 @@ struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
 
 template <typename T>
 struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>, std::vector<T>, std::vector<T>> {
+                         std::vector<T>, std::vector<T>, std::vector<T>,
+                         typename jit::LSTMTuples<T>::attr_type> {
   void operator()(const typename jit::LSTMTuples<T>::func_type tgt,
                   const std::vector<T>& xsrc, const std::vector<T>& wp,
                   const std::vector<T>& ct_1, const std::vector<T>& ct_ref,
@@ -185,7 +190,8 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
 
 template <typename T>
 struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
-                         std::vector<T>> {
+                         std::vector<T>,
+                         typename jit::GRUTuples<T>::attr_type> {
   void operator()(const typename jit::GRUTuples<T>::func_type tgt,
                   const std::vector<T>& xsrc, const std::vector<T>& ht_1,
                   const std::vector<T>& ht_ref,
@@ -212,8 +218,8 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
 };
 
 template <typename T>
-struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>,
-                         std::vector<T>> {
+struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
+                         typename jit::SeqPoolTuples<T>::attr_type> {
   void operator()(const typename jit::SeqPoolTuples<T>::func_type tgt,
                   const std::vector<T>& x, const std::vector<T>& yref,
                   const typename jit::SeqPoolTuples<T>::attr_type& attr) {
@@ -385,8 +391,8 @@ void TestLSTMKernel() {
             std::vector<T> xsrc(4 * d), wp(3 * d), ct_1(d);
             std::vector<T> ct_ref(d), ht_ref(d), checked(2 * d);
             RandomVec<T>(4 * d, xsrc.data(), -2.f, 2.f);
-            RandomVec<T>(3 * d, wp.data(), -2.f, 2.f);
-            RandomVec<T>(d, ct_1.data(), -2.f, 2.f);
+            RandomVec<T>(3 * d, wp.data(), -1.f, 1.f);
+            RandomVec<T>(d, ct_1.data(), -1.f, 1.f);
             // x could be changed after compute, so copy to save src
             std::vector<T> x(xsrc.size());
             std::copy(xsrc.begin(), xsrc.end(), x.begin());
@@ -481,14 +487,17 @@ void TestSeqPoolKernel() {
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  auto last_acc = acc;
+  // TODO(intel): this should be acc issue of MKL
+  acc = 1e-3;
   for (int m : {1, 2, 3, 4}) {
     for (int n : {1, 2, 3, 4}) {
       for (int k : TestSizes()) {
         auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
         EXPECT_TRUE(ref != nullptr);
         std::vector<T> a(m * k), b(k * n), c(m * n);
-        RandomVec<T>(m * k, a.data(), -0.2f, 0.2f);
-        RandomVec<T>(k * n, b.data(), -0.2f, 0.2f);
+        RandomVec<T>(m * k, a.data(), -2.f, 2.f);
+        RandomVec<T>(k * n, b.data(), -2.f, 2.f);
         const T* a_data = a.data();
         const T* b_data = b.data();
         T* c_data = c.data();
@@ -498,6 +507,7 @@ void TestMatMulKernel() {
       }
     }
   }
+  acc = last_acc;
 }
 
 template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
-- 
GitLab


From c8c3efad768578fb56bea91092e8ca73b007c290 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 18 Jan 2019 17:26:30 +0800
Subject: [PATCH 24/73] fix bias

test=develop
---
 python/paddle/fluid/imperative/nn.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 79986070c23..5384533591c 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -221,8 +221,11 @@ class FC(layers.Layer):
         self._dtype = dtype
         from ..layer_helper import LayerHelper
         self._helper = LayerHelper(
-            'FC', param_attr=param_attr, act=act, name=name)
-        self._bias_attr = bias_attr if bias_attr else ParamAttr()
+            'FC',
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            act=act,
+            name=name)
 
     def parameters(self):
         return [self._w, self._b]
@@ -256,14 +259,16 @@ class FC(layers.Layer):
             inputs={"X": [tmp]},
             outputs={"Out": out},
             attrs={"use_mkldnn": False})
-        if not self._bias_attr:
-            return out
+
+        bias_attr = self._helper.bias_attr
+        if not bias_attr:
+            return
 
         # add bias
         size = list(out.shape[1:])
         if not self._built:
             self._b = self._helper.create_parameter(
-                attr=self._bias_attr, shape=size, dtype=out.dtype, is_bias=True)
+                attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True)
         bias_out = self._helper.create_variable_for_type_inference(
             dtype=out.dtype)
         self._helper.append_op(
-- 
GitLab


From 3c09a57e472c9e92dd7fb36c6d00a558e87e87c6 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 18 Jan 2019 17:31:48 +0800
Subject: [PATCH 25/73] fix bias

test=develop
---
 python/paddle/fluid/imperative/nn.py | 33 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 5384533591c..03fbfe76d12 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -261,21 +261,20 @@ class FC(layers.Layer):
             attrs={"use_mkldnn": False})
 
         bias_attr = self._helper.bias_attr
-        if not bias_attr:
-            return
-
-        # add bias
-        size = list(out.shape[1:])
-        if not self._built:
-            self._b = self._helper.create_parameter(
-                attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True)
-        bias_out = self._helper.create_variable_for_type_inference(
-            dtype=out.dtype)
-        self._helper.append_op(
-            type='elementwise_add',
-            inputs={'X': [out],
-                    'Y': [self._b]},
-            outputs={'Out': [bias_out]},
-            attrs={'axis': 1})
+        if bias_attr:
+            # add bias
+            size = list(out.shape[1:])
+            if not self._built:
+                self._b = self._helper.create_parameter(
+                    attr=bias_attr, shape=size, dtype=out.dtype, is_bias=True)
+            bias_out = self._helper.create_variable_for_type_inference(
+                dtype=out.dtype)
+            self._helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [out],
+                        'Y': [self._b]},
+                outputs={'Out': [bias_out]},
+                attrs={'axis': 1})
+            out = bias_out
         # add activation
-        return self._helper.append_activation(bias_out)
+        return self._helper.append_activation(out)
-- 
GitLab


From 7e651a38dd9c3e8f68b75ba8e68086ef0c9151d1 Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Fri, 18 Jan 2019 18:04:20 +0800
Subject: [PATCH 26/73] fix mac cmake version 3.13 build (#15386)

* fix mac cmake version 3.13 test=develop

* fix again test=develop
---
 cmake/generic.cmake                               | 4 ++--
 paddle/fluid/operators/distributed/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d5eaa987718..3f1be11d855 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -748,7 +748,7 @@ function(grpc_library TARGET_NAME)
   #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
   # somehow it didn't. line 602 to 604 is to patching this. Leaving this here
   # for now to enable dist CI.
-  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
   cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
@@ -791,7 +791,7 @@ function(brpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
 
-  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
   cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
   cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
 endfunction()
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 1249ef9a9b5..7fcbf85f187 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -7,7 +7,7 @@ if(WITH_GRPC)
 else()
     set(cc_generic_services "true")
 endif()
-configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY)
+configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
 
 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
@@ -19,7 +19,7 @@ if(WITH_GRPC)
         variable_response.cc
         collective_client.cc collective_server.cc
         ${GRPC_SRCS}
-      PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto 
+      PROTO send_recv.proto 
       DEPS lod_tensor selected_rows_functor memory)
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-- 
GitLab


From 316e44b1b7d674b9b6ac0a8bbfa1725a75d2fbda Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 18 Jan 2019 13:55:02 +0000
Subject: [PATCH 27/73] fix unused warnings

test=develop
---
 paddle/fluid/operators/jit/benchmark.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index e7041b12288..74d6a872478 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/device_tracer.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/platform/variant.h"  // for UNUSED
 
 DEFINE_int32(burning, 10, "Burning times.");
 DEFINE_int32(repeat, 3000, "Repeat times.");
@@ -53,7 +54,7 @@ BenchJITKernel* InsertBenchmark(BenchJITKernel* b) {
     const char* Place() override { return #place; }                            \
     void Run() override;                                                       \
   };                                                                           \
-  static auto inserted_##name##_##dtype##_##place##_ =                         \
+  static auto inserted_##name##_##dtype##_##place##_ UNUSED =                  \
       InsertBenchmark(new BenchJITKernel_##name##_##dtype##_##place##_());     \
   void BenchJITKernel_##name##_##dtype##_##place##_::Run()
 
-- 
GitLab


From 451896fce4d48306737883c73a837ecda3f691d7 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Sat, 19 Jan 2019 02:58:25 +0800
Subject: [PATCH 28/73] init quantization.

---
 .../fluid/framework/details/build_strategy.cc |   1 +
 paddle/fluid/framework/ir/pass.cc             |   4 +
 paddle/fluid/pybind/ir.cc                     |   6 +
 paddle/fluid/pybind/protobuf.cc               |   6 +
 paddle/fluid/pybind/pybind.cc                 |  14 +-
 .../paddle/fluid/contrib/slim/graph/graph.py  |  79 ++++-
 .../quantization/quantization_performer.py    | 287 ++++++++++++++++++
 7 files changed, 391 insertions(+), 6 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/slim/quantization/quantization_performer.py

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index df0ff772c9d..ad73085f52e 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -233,3 +233,4 @@ USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(lock_free_optimize_pass);
+USE_PASS(graph_to_program_pass);
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index 6cf405efe63..33ccee6aa0a 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -28,10 +28,14 @@ std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
     PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.",
                    attr);
   }
+  auto* native_graph = graph.get();
   auto applied_graph = ApplyImpl(std::move(graph));
   // TODO(panyx0718): Add more verifications.
   PADDLE_ENFORCE(!HasCircle(*applied_graph),
                  "Illegal Pass. Generated graph shouldn't has cycle.");
+  PADDLE_ENFORCE(applied_graph.get() == native_graph,
+                 "Pass::Apply() cannot delete the passed graph and shouldn't "
+                 "return a new graph.(For the need of pybind11)");
   applied_ = true;
   return applied_graph;
 }
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index d32fe58f869..1205ccf7f02 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -42,6 +42,7 @@ void BindGraph(py::module *m) {
       .def("get_float", &Graph::Get<float>)
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
+      .def("get_program", &Graph::Get<ProgramDesc>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
       .def("set",
@@ -57,6 +58,11 @@ void BindGraph(py::module *m) {
            [](Graph &self, const std::string &attr_name, double attr) {
              return self.Set(attr_name, new double(attr));
            })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const ProgramDesc &attr) {
+             return self.Set(attr_name, new ProgramDesc(attr));
+           })
       .def("erase", &Graph::Erase)
       .def("nodes", &Graph::Nodes, return_value_policy::reference)
       .def("create_var_node",
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 4b218fb3a2a..09c08f1ffc8 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -229,6 +229,12 @@ void BindBlockDesc(pybind11::module *m) {
 void BindVarDsec(pybind11::module *m) {
   pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
   var_desc
+      .def("__init__",
+           [](pd::VarDesc &self, const pybind11::bytes &binary_str) {
+             std::string str(binary_str);
+             new (&self) pd::VarDesc(str);
+           },
+           pybind11::return_value_policy::reference)
       .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
       .def("set_name", &pd::VarDesc::SetName)
       .def("set_shape", &pd::VarDesc::SetShape)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index f3f4854a9ef..ae50f3885f6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -786,9 +786,20 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
+  m.def("get_pass", [](const py::bytes &binary_str) {
+    std::string pass_type(binary_str);
+    auto pass = framework::ir::PassRegistry::Instance().Get(pass_type);
+    return std::shared_ptr<framework::ir::Pass>(std::move(pass));
+  });
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
+      .def("has", &ir::Pass::Has)
+      .def("set_program",
+           [](ir::Pass &self, const std::string &attr_name,
+              const ProgramDesc &attr) {
+             return self.Set(attr_name, new ProgramDesc(attr));
+           })
       .def(
           "set_str",
           [](ir::Pass &self, const std::string &name, const std::string &attr) {
@@ -796,11 +807,12 @@ All parameter, weight, gradient are variables in Paddle.
           })
       .def("set_int", [](ir::Pass &self, const std::string &name,
                          int val) { self.Set<const int>(name, new int(val)); })
+      .def("get_program", &ir::Pass::Get<ProgramDesc>)
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
         std::unique_ptr<ir::Graph> origin_graph(graph.get());
         auto optim_graph = self.Apply(std::move(origin_graph));
-        graph.reset(optim_graph.release());
+        optim_graph.release();
       });
 
   py::class_<ir::PassBuilder, std::shared_ptr<ir::PassBuilder>> pb(
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
index 7d6b0702035..774da2d1ef1 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -13,8 +13,81 @@
 # limitations under the License.
 
 from ....framework import Program
+from ....framework import Block
+from .... import core
 
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
+__all__ = ['Graph', 'ImitationGraph', 'PyGraph']
+
+
+class PyGraph(object):
+    """
+    PyGraph uses core.Graph as the delegation to accomplish the manipulation.
+    """
+
+    def __init__(self, graph):
+        assert isinstance(
+            graph, core.Graph), 'graph must be the instance of core.Graph.'
+        self.graph = graph
+
+    def all_parameters(self):
+        params = []
+        for node in self.graph.nodes():
+            if node.is_var() and node.var().persistable():
+                params.append(node)
+        return params
+
+    def all_vars(self):
+        return [node for node in self.graph.nodes() if node.is_var()]
+
+    def all_ops(self):
+        return [node for node in self.graph.nodes() if node.is_op()]
+
+    def create_param_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        var_desc.set_persistable(True)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node_from_desc(self, var_desc):
+        return self.graph.create_var_node(var_desc)
+
+    def create_op_node(self, op_type, attrs, inputs, outputs):
+        op_desc = core.OpDesc()
+        op_desc.set_type(op_type)
+        for attr, value in attrs.iteritems():
+            self._update_desc_attr(op_desc, attr, value)
+        for input_name, var_node in inputs.iteritems():
+            op_desc.set_input(input_name, [var_node.name()])
+        for output_name, var_node in outputs.iteritems():
+            op_desc.set_output(output_name, [var_node.name()])
+        return self.graph.create_op_node(op_desc)
+
+    def create_op_node_from_desc(self, op_desc):
+        return self.graph.create_op_node(op_desc)
+
+    def _update_desc_attr(self, desc, name, val):
+        """
+        Update the value of desc's attribute by attribute's name.
+        """
+        if isinstance(val, Block):
+            desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and all(
+                isinstance(v, Block) for v in val):
+            desc.set_blocks_attr(name, [v.desc for v in val])
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            desc._set_attr(name, val)
 
 
 class Graph(object):
@@ -39,7 +112,3 @@ class ImitationGraph(Graph):
 
     def all_parameters(self):
         return self.program.global_block().all_parameters()
-
-
-class IRGraph(Graph):
-    pass
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
new file mode 100644
index 00000000000..7d9207dfbc9
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
@@ -0,0 +1,287 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import numpy as np
+from .... import core
+from ....initializer import Constant
+from .... import unique_name
+from ..graph import PyGraph
+
+
+class QuantizationPerformer(object):
+    def __init__(self,
+                 weight_bits=8,
+                 activation_bits=8,
+                 activation_quantize_type='abs_max',
+                 weight_quantize_type='abs_max',
+                 window_size=10000):
+        """
+        Convert and rewrite the IRGraph according to weight and
+        activation quantization type.
+        Args:
+            weight_bits (int): quantization bit number for weights,
+                the bias is not quantized.
+            activation_bits (int): quantization bit number for activation.
+            activation_quantize_type (str): quantization type for activation,
+                now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
+                the quantization scale will be calculated dynamically each step
+                in both training and testing period. If use 'range_abs_max',
+                a static quantization scale will be calculated during training
+                and used in inference.
+            weight_quantize_type (str): quantization type for weights,
+                support 'abs_max'. The 'range_abs_max' usually is not used for
+                weight, since weights are fixed once the model is well trained.
+            window_size (int): the window size for 'range_abs_max' quantization.
+        Examples:
+        .. code-block:: python
+            # the original graph will be rewrite, if you don't want to
+            # change it, please clone at first.
+            # graph = graph.clone()
+            from paddle.fluid.contrib.slim import *
+            from paddle.fluid.contrib.quantize import *
+            graph = IRGraph(program)
+            performer = QuantizationPerformer()
+            performer.quantize_transform(graph)
+        """
+        self.weight_bits = weight_bits
+        self.activation_bits = activation_bits
+
+        quant_type = ['abs_max', 'range_abs_max']
+        if activation_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown activation_quantize_type : '%s'. It can only be ",
+                "'abs_max' or 'range_abs_max'.", str(activation_quantize_type))
+        if weight_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown weight_quantize_type: '%s'. It can only be ",
+                "'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
+
+        self.activation_quantize_type = activation_quantize_type
+        self.weight_quantize_type = weight_quantize_type
+        self.window_size = window_size
+
+        self.need_inited_outer = collections.OrderedDict()
+        self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self.quantizable_grad_ops = [
+            '%s_grad' % (op) for op in self.quantizable_ops
+        ]
+        self.fake_quant_op_types = [
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+        ]
+        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
+        self.is_test = None
+        self.global_step = None
+
+    def quantize_transform(self, graph, is_test):
+        self.need_inited_outer.clear()
+        self.is_test = is_test
+        assert isinstance(graph,
+                          PyGraph), 'graph must be the instance of PyGraph.'
+        # marked the variable which has been dequantized.
+        dequantized_vars = collections.OrderedDict()
+        params = [p.name() for p in graph.all_parameters()]
+
+        def _transform_forward(graph, op):
+            for var_node in op.inputs:
+                if var_node.name() in dequantized_vars:
+                    dequant_var_node = dequantized_vars[var_node.name()]
+                else:
+                    quant_bits = self.weight_bits if var_node.name() in params \
+                    else self.activation_bits
+                    quant_type = self.weight_quantize_type if var_node.name() \
+                        in params else self.activation_quantize_type
+                    quant_var_node, scale_var_node = self._insert_quant_op(
+                        graph, var_node, quant_bits, quant_type)
+                    dequant_var_node = self._insert_dequant_op(
+                        graph, quant_var_node, scale_var_node, quant_bits)
+                    dequantized_vars[var_node.name()] = dequant_var_node
+                self._update_input(var_node, dequant_var_node, op)
+
+        if not self.is_test:
+            self._create_global_step(graph)
+        ops = graph.all_ops()
+        for op in ops:
+            # transform the forward graph
+            if op.name() in self.quantizable_ops:
+                _transform_forward(graph, op)
+            # rename the inputs of backward op
+            if op.name() in self.quantizable_grad_ops:
+                _transform_backward(graph, op)
+        return self.need_inited_outer
+
+    def _insert_quant_op(self, graph, var_node, quant_bits, quant_type):
+        """
+        Insert fake_quantize_op in the graph.
+        """
+        if quant_type == 'abs_max':
+            return self._insert_quant_abs_max_op(graph, var_node, quant_bits)
+        elif quant_type == 'range_abs_max':
+            return self._inser_quant_range_abs_max_op(graph, var_node,
+                                                      quant_bits)
+
+    def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
+        """
+        Insert fake_quantize_abs_max op in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_node.name()),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=var_node.var().dtype())
+        scale_var_node = graph.create_var_node(
+            name=self._quantized_scale_name(var_node.name()),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=var_node.var().dtype())
+        quant_op_node = graph.create_op_node(
+            op_type='fake_quantize_abs_max',
+            attrs={'bit_length': quant_bits},
+            inputs={'X': var_node},
+            outputs={'Out': quant_var_node,
+                     'OutScale': scale_var_node})
+        self._link_to(var_node, quant_op_node)
+        self._link_to(quant_op_node, quant_var_node)
+        self._link_to(quant_op_node, scale_var_node)
+        return quant_var_node, scale_var_node
+
+    def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits):
+        """
+        Insert fake_quantize_range_abs_max on the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_node.name()),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=var_node.var().dtype())
+
+        scale_in_node = graph.create_param_node(
+            name=self._quantized_scale_name(var_node.name()),
+            var_type=core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1],
+            var_dtype=var_node.var().dtype())
+        self.need_inited_outer[scale_in_node.var()] = Constant(value=0.001)
+
+        scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
+        inputs = {'X': var_node, 'InScale': scale_in_node}
+        outputs = {'Out': quant_var_node, 'OutScale': scale_out_node}
+
+        if not self.is_test:
+            # The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
+            scales_node = graph.create_param_node(
+                name=unique_name.generate('scales'),
+                var_type=core.VarDesc.VarType.LOD_TENSOR,
+                shape=[self.window_size],
+                var_dtype=var_node.var().dtype())
+            self.need_inited_outer[scales_node.var()] = Constant(value=0)
+            inputs['Iter'] = self.global_step
+            outputs['OutScales'] = scales_node
+        attrs = {
+            'window_size': self.window_size,
+            'bit_length': quant_bits,
+            'is_test': self.is_test
+        }
+        quant_op_node = graph.create_op_node(
+            op_type='fake_quantize_range_abs_max',
+            attrs=attrs,
+            inputs=inputs,
+            outputs=outputs)
+
+        self._link_to(var_node, quant_op_node)
+        self._link_to(scale_in_node, quant_op_node)
+        self._link_to(quant_op_node, quant_var_node)
+        self._link_to(quant_op_node, scale_out_node)
+
+        if not self.is_test:
+            self._link_to(self.global_step, quant_op_node)
+            self._link_to(quant_op_node, scales_node)
+
+        return quant_var_node, scale_out_node
+
+    def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
+        """
+        Insert fake_dequantize_op in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(var_node.name()),
+            var_type=var_node.var().type(),
+            shape=var_node.var().shape(),
+            var_dtype=var_node.var().dtype())
+        max_range = (1 << (quant_bits - 1)) - 1
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_dequantize_max_abs',
+            attrs={'max_range': float(max_range)},
+            inputs={'X': var_node,
+                    'Scale': scale_var_node},
+            outputs={'Out': dequant_var_node})
+        self._link_to(var_node, dequant_op_node)
+        self._link_to(scale_var_node, dequant_op_node)
+        self._link_to(dequant_op_node, dequant_var_node)
+        return dequant_var_node
+
+    def _update_input(self, old_input_node, new_input_node, op_node):
+        old_input_node.outputs.remove(op_node)
+        op_node.inputs.remove(old_input_node)
+        new_input_node.outputs.append(op_node)
+        op_node.inputs.append(new_input_node)
+
+    def _link_to(node_in, node_out):
+        node_in.outputs.append(node_out)
+        node_out.inputs.append(node_in)
+
+    def _quantized_var_name(self, var_name):
+        """
+        Return quantized variable name for the input `var_name`.
+        """
+        return "%s.quantized" % (var_name)
+
+    def _dequantized_var_name(self, var_name):
+        """
+        Return dequantized variable name for the input `var_name`.
+        """
+        return "%s.dequantized" % (var_name)
+
+    def _quantized_scale_name(self, var_name):
+        """
+        Return quantized variable name for the input `var_name`.
+        """
+        return "%s.scale" % (var_name)
+
+    def _original_var_name(self, var_name):
+        """
+        Return the original variable name.
+        """
+        if var_name.endswith('.quantized.dequantized'):
+            return var_name[:-len('.quantized.dequantized')]
+        if var_name.endswith('.quantized'):
+            return var_name[:-len('.quantized')]
+        if var_name.endswith('.dequantized'):
+            return var_name[:-len('.dequantized')]
+        if var_name.endswith('.scale'):
+            return var_name[:-len('.scale')]
+        else:
+            return var_name
+
+    def _is_float(self, v):
+        return isinstance(v, float) or isinstance(v, np.float32)
+
+    def _quant(self, x, scale, num_bits):
+        y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+        return y
-- 
GitLab


From c102f427d2eea147e75c69213c2e4253feb9071c Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 22:02:04 +0800
Subject: [PATCH 29/73] make 'paddle version' valid

test=develop
---
 paddle/scripts/submit_local.sh.in | 36 +------------------------------
 python/setup.py.in                |  2 ++
 2 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 622a2d51049..1f421f248fa 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -1,19 +1,5 @@
 #!/bin/bash
 
-function usage(){
-        echo "usage: paddle [--help] [<args>]"
-        echo "These are common paddle commands used in various situations:"
-        echo "    train             Start a paddle_trainer"
-        echo "    merge_model       Start a paddle_merge_model"
-        echo "    pserver           Start a paddle_pserver_main"
-        echo "    version           Print paddle version"
-        echo "    dump_config       Dump the trainer config as proto string"
-        echo "    make_diagram      Make Diagram using Graphviz"
-        echo ""
-        echo "'paddle train --help' 'paddle merge_model --help', 'paddle pserver --help', list more detailed usage of each command"
-}
-
-
 function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
@@ -177,30 +163,10 @@ cpu_config
 # echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
-    "train")
-        threads_config $@
-        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
-        ;;
-    "merge_model")
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2}
-        ;;
-    "pserver")
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2}
-        ;;
-    "dump_config")
-        python -m paddle.utils.dump_config ${@:2}
-        ;;
-    "make_diagram")
-        python -m paddle.utils.make_model_diagram ${@:2}
-        ;;
     "version")
         version
         ;;
-    "--help")
-        usage
-        ;;
     *)
-        usage
+        version
         ;;
  esac
diff --git a/python/setup.py.in b/python/setup.py.in
index 730b2e1f71c..e00c88b3a6e 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -125,6 +125,8 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
+if not '${WIN32}':
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
 
 package_dir={
-- 
GitLab


From cf7dd49d5211106efc3f93b587b5e1b1fd87e9d1 Mon Sep 17 00:00:00 2001
From: dongdaxiang <dongdaxiang@baidu.com>
Date: Sat, 19 Jan 2019 23:02:45 +0800
Subject: [PATCH 30/73] remove file cnt assignment in async_executor

---
 python/paddle/fluid/async_executor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index 4ca6a5170eb..25f95ffbb0a 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -200,7 +200,6 @@ class AsyncExecutor(object):
             local_path,
             self.instance.get_worker_index(),
             self.instance.get_node_cnt() / 2,
-            file_cnt,
             multi_processes=process_num)
         self.instance.barrier_worker()  #wait for download_data
 
-- 
GitLab


From db50b0110056e29cb7feb9b263eb90c6893323f3 Mon Sep 17 00:00:00 2001
From: dongdaxiang <dongdaxiang@baidu.com>
Date: Sat, 19 Jan 2019 23:02:45 +0800
Subject: [PATCH 31/73] remove file cnt assignment in async_executor
 test=develop

---
 python/paddle/fluid/async_executor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
index 4ca6a5170eb..25f95ffbb0a 100644
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
@@ -200,7 +200,6 @@ class AsyncExecutor(object):
             local_path,
             self.instance.get_worker_index(),
             self.instance.get_node_cnt() / 2,
-            file_cnt,
             multi_processes=process_num)
         self.instance.barrier_worker()  #wait for download_data
 
-- 
GitLab


From e2ff300b02feda77473c8703954b990828a3a10e Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Sun, 20 Jan 2019 15:24:45 +0800
Subject: [PATCH 32/73] add UT for quantization.

---
 .../fluid/framework/details/build_strategy.cc |   1 +
 paddle/fluid/pybind/ir.cc                     |  55 ++++++-
 paddle/fluid/pybind/protobuf.cc               |   8 +-
 .../paddle/fluid/contrib/slim/graph/graph.py  |  80 +++++++++--
 .../contrib/slim/quantization/__init__.py     |  20 +++
 .../quantization/quantization_performer.py    |  69 +++++++--
 .../unitest/test_quantization_performer.py    | 135 ++++++++++++++++++
 python/setup.py.in                            |   1 +
 8 files changed, 336 insertions(+), 33 deletions(-)
 create mode 100644 python/paddle/fluid/contrib/slim/quantization/__init__.py
 create mode 100644 python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index ad73085f52e..e69d67dc55a 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 1205ccf7f02..24059140ab2 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -15,7 +15,9 @@
 #include "paddle/fluid/pybind/ir.h"
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
@@ -24,6 +26,7 @@
 namespace py = pybind11;
 using paddle::framework::ir::Graph;
 using paddle::framework::ir::Node;
+using paddle::framework::ir::GraphSafeRemoveNodes;
 using paddle::framework::OpDesc;
 using paddle::framework::ProgramDesc;
 using paddle::framework::VarDesc;
@@ -32,6 +35,7 @@ using pybind11::return_value_policy;
 namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
+  m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
   py::class_<Graph, std::shared_ptr<Graph>>(
       *m, "Graph",
       "The graph is a Directed Acyclic Single Static Assignment Graph, see "
@@ -43,6 +47,7 @@ void BindGraph(py::module *m) {
       .def("get_double", &Graph::Get<double>)
       .def("get_string", &Graph::Get<std::string>)
       .def("get_program", &Graph::Get<ProgramDesc>)
+      .def("get_marked_nodes", &Graph::Get<std::unordered_set<const Node *>>)
       .def("set", [](Graph &self, const std::string &attr_name,
                      int attr) { return self.Set(attr_name, new int(attr)); })
       .def("set",
@@ -63,6 +68,12 @@ void BindGraph(py::module *m) {
               const ProgramDesc &attr) {
              return self.Set(attr_name, new ProgramDesc(attr));
            })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const std::unordered_set<const Node *> &attr) {
+             return self.Set(attr_name,
+                             new std::unordered_set<const Node *>(attr));
+           })
       .def("erase", &Graph::Erase)
       .def("nodes", &Graph::Nodes, return_value_policy::reference)
       .def("create_var_node",
@@ -91,12 +102,52 @@ void BindNode(py::module *m) {
   py::class_<Node> node(*m, "Node");
   node.def("name", &Node::Name)
       .def("node_type", &Node::NodeType)
-      .def("var", &Node::Var)
-      .def("op", &Node::Op)
+      .def("var", &Node::Var, return_value_policy::reference)
+      .def("op", &Node::Op, return_value_policy::reference)
       .def("id", &Node::id)
       .def("is_op", &Node::IsOp)
       .def("is_var", &Node::IsVar)
       .def("is_ctrl_var", &Node::IsCtrlVar)
+      .def("inputs_remove",
+           [](Node &self, int node_id) {
+             // erase() invalidates `it`; continue from the iterator it
+             // returns instead of incrementing the erased one.
+             auto &ins = self.inputs;
+             for (auto it = ins.begin(); it != ins.end();) {
+               it = ((*it)->id() == node_id) ? ins.erase(it) : it + 1;
+             }
+           })
+      .def("inputs_remove",
+           [](Node &self, Node &node) {
+             // Same erase-safe loop, matching by node address instead of id;
+             // every occurrence of `node` is removed.
+             auto &ins = self.inputs;
+             for (auto it = ins.begin(); it != ins.end();) {
+               it = (*it == &node) ? ins.erase(it) : it + 1;
+             }
+           })
+      .def("inputs_append",
+           [](Node &self, Node &node) { self.inputs.push_back(&node); })
+      .def("outputs_remove",
+           [](Node &self, int node_id) {
+             // erase() invalidates `it`; continue from the iterator it
+             // returns instead of incrementing the erased one.
+             auto &outs = self.outputs;
+             for (auto it = outs.begin(); it != outs.end();) {
+               it = ((*it)->id() == node_id) ? outs.erase(it) : it + 1;
+             }
+           })
+      .def("outputs_remove",
+           [](Node &self, Node &node) {
+             // Same erase-safe loop, matching by node address instead of id;
+             // every occurrence of `node` is removed.
+             auto &outs = self.outputs;
+             for (auto it = outs.begin(); it != outs.end();) {
+               it = (*it == &node) ? outs.erase(it) : it + 1;
+             }
+           })
+      .def("outputs_append",
+           [](Node &self, Node &node) { self.outputs.push_back(&node); })
       .def_readwrite("inputs", &Node::inputs)
       .def_readwrite("outputs", &Node::outputs);
 
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 09c08f1ffc8..e729be4a95a 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -228,13 +228,7 @@ void BindBlockDesc(pybind11::module *m) {
 
 void BindVarDsec(pybind11::module *m) {
   pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
-  var_desc
-      .def("__init__",
-           [](pd::VarDesc &self, const pybind11::bytes &binary_str) {
-             std::string str(binary_str);
-             new (&self) pd::VarDesc(str);
-           },
-           pybind11::return_value_policy::reference)
+  var_desc.def(pybind11::init<const std::string &>())
       .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
       .def("set_name", &pd::VarDesc::SetName)
       .def("set_shape", &pd::VarDesc::SetShape)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
index 774da2d1ef1..61f9f950c4d 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -11,12 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from __future__ import print_function
+import os
+import subprocess
 from ....framework import Program
 from ....framework import Block
 from .... import core
 
-__all__ = ['Graph', 'ImitationGraph', 'PyGraph']
+__all__ = ['Graph', 'ImitationGraph', 'IRGraph', 'PyGraph']
 
 
 class PyGraph(object):
@@ -30,17 +32,18 @@ class PyGraph(object):
         self.graph = graph
 
     def all_parameters(self):
-        params = []
+        param_nodes = set()
         for node in self.graph.nodes():
-            if node.is_var() and node.var().persistable():
-                params.append(node)
-        return params
+            if node.is_var() and node.var() is not None and node.var(
+            ).persistable():
+                param_nodes.add(node)
+        return param_nodes
 
     def all_vars(self):
-        return [node for node in self.graph.nodes() if node.is_var()]
+        return {node for node in self.graph.nodes() if node.is_var()}
 
     def all_ops(self):
-        return [node for node in self.graph.nodes() if node.is_op()]
+        return {node for node in self.graph.nodes() if node.is_op()}
 
     def create_param_node(self, name, var_type, shape, var_dtype):
         var_desc = core.VarDesc(name)
@@ -65,10 +68,16 @@ class PyGraph(object):
         op_desc.set_type(op_type)
         for attr, value in attrs.iteritems():
             self._update_desc_attr(op_desc, attr, value)
-        for input_name, var_node in inputs.iteritems():
-            op_desc.set_input(input_name, [var_node.name()])
-        for output_name, var_node in outputs.iteritems():
-            op_desc.set_output(output_name, [var_node.name()])
+        for input_name, var_nodes in inputs.iteritems():
+            if not isinstance(var_nodes, list):
+                var_nodes = [var_nodes]
+            op_desc.set_input(input_name,
+                              [var_node.name() for var_node in var_nodes])
+        for output_name, var_nodes in outputs.iteritems():
+            if not isinstance(var_nodes, list):
+                var_nodes = [var_nodes]
+            op_desc.set_output(output_name,
+                               [var_node.name() for var_node in var_nodes])
         return self.graph.create_op_node(op_desc)
 
     def create_op_node_from_desc(self, op_desc):
@@ -89,6 +98,49 @@ class PyGraph(object):
         else:
             desc._set_attr(name, val)
 
+    def safe_remove_nodes(self, remove_nodes):
+        if not isinstance(remove_nodes, set):
+            remove_nodes = set(remove_nodes)
+        core.graph_safe_remove_nodes(self.graph, remove_nodes)
+
+    def draw_graph(self, save_path, name, marked_nodes=None):
+        """Drop ctrl vars, then save the graph as <name>.dot and a pdf."""
+        def _convert_to_pdf(dot_path):
+            pdf_save_path = os.path.splitext(dot_path)[0] + '.pdf'
+            try:  # argv list, so odd characters in paths never hit a shell
+                exited_code = subprocess.call(
+                    ['dot', '-Tpdf', dot_path, '-o', pdf_save_path])
+            except OSError:  # the `dot` executable could not be started
+                exited_code = 1
+            if exited_code != 0:
+                print('The dot command is needed for creating pdf files.')
+                print('The {} is saved as the dot filetype.'.format(dot_path))
+
+        remove_ctr_vars, ops_num = set(), 0
+        for node in self.graph.nodes():
+            if node.is_ctrl_var():
+                remove_ctr_vars.add(node)
+            elif node.is_op():
+                ops_num += 1
+        print('Total ops num = {}.'.format(ops_num))
+        self.safe_remove_nodes(remove_ctr_vars)
+        if marked_nodes is not None:
+            marked_nodes = set(marked_nodes) - remove_ctr_vars
+            self.graph.set('__graphviz__marked_node__', marked_nodes)
+        viz_dot_path = os.path.join(save_path, name) + '.dot'
+        viz_pass = core.get_pass('graph_viz_pass')
+        viz_pass.set_str('graph_viz_path', viz_dot_path)
+        viz_pass.apply(self.graph)
+        _convert_to_pdf(viz_dot_path)
+
+    def to_program(self):
+        convert_pass = core.get_pass('graph_to_program_pass')
+        convert_pass.set_program('program', Program().desc)
+        convert_pass.apply(self.graph)
+        program = Program()
+        program.desc = convert_pass.get_program('program')
+        return program
+
 
 class Graph(object):
     """
@@ -112,3 +164,7 @@ class ImitationGraph(Graph):
 
     def all_parameters(self):
         return self.program.global_block().all_parameters()
+
+
+class IRGraph(Graph):
+    pass
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
new file mode 100644
index 00000000000..f5223854176
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quantization_performer
+from .quantization_performer import *
+
+__all__ = quantization_performer.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
index 7d9207dfbc9..ac84b763a6c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
@@ -19,6 +19,8 @@ from ....initializer import Constant
 from .... import unique_name
 from ..graph import PyGraph
 
+__all__ = ['QuantizationPerformer']
+
 
 class QuantizationPerformer(object):
     def __init__(self,
@@ -108,19 +110,62 @@ class QuantizationPerformer(object):
                         graph, quant_var_node, scale_var_node, quant_bits)
                     dequantized_vars[var_node.name()] = dequant_var_node
                 self._update_input(var_node, dequant_var_node, op)
+                op.op()._rename_input(var_node.name(), dequant_var_node.name())
+
+        def _transform_backward(graph, op):
+            no_dequanted_input_vars = True
+            for var_node in op.inputs:
+                if var_node.name() in dequantized_vars:
+                    dequant_var_node = dequantized_vars[var_node.name()]
+                    self._update_input(var_node, dequant_var_node, op)
+                    op.op()._rename_input(var_node.name(),
+                                          dequant_var_node.name())
+                    no_dequanted_input_vars = False
+            if no_dequanted_input_vars:
+                raise ValueError("There is no dequanted inputs for op %s." %
+                                 (op.name()))
 
         if not self.is_test:
             self._create_global_step(graph)
         ops = graph.all_ops()
+        # The process of _transform_forward and _transform_backward is needed in two for loops.
+        # The loop for transforming the forward graph:
         for op in ops:
-            # transform the forward graph
             if op.name() in self.quantizable_ops:
                 _transform_forward(graph, op)
-            # rename the inputs of backward op
+        # The loop for renaming the inputs of backward op.
+        for op in ops:
             if op.name() in self.quantizable_grad_ops:
                 _transform_backward(graph, op)
+
         return self.need_inited_outer
 
+    def _create_global_step(self, graph):
+        """Reuse or create the '@STEP_COUNTER@' var used by range_abs_max."""
+        quant_types = (self.weight_quantize_type,
+                       self.activation_quantize_type)
+        if 'range_abs_max' not in quant_types:
+            return
+        counter_name = '@STEP_COUNTER@'
+        for var_node in graph.all_vars():
+            if var_node.name() == counter_name:
+                self.global_step = var_node
+        if self.global_step is not None:
+            return
+        # No counter in the graph yet: create the var and an increment op on it.
+        step_in = graph.create_param_node(
+            name=counter_name, var_type=core.VarDesc.VarType.LOD_TENSOR,
+            shape=[1], var_dtype=core.VarDesc.VarType.INT64)
+        self.need_inited_outer[step_in.var()] = Constant(
+            value=0, force_cpu=True)
+        step_out = graph.create_var_node_from_desc(step_in.var())
+        increment_op = graph.create_op_node(
+            op_type='increment', attrs={'step': 1.0},
+            inputs={'X': step_in}, outputs={'Out': step_out})
+        self._link_to(step_in, increment_op)
+        self._link_to(increment_op, step_out)
+        self.global_step = step_out
+
     def _insert_quant_op(self, graph, var_node, quant_bits, quant_type):
         """
         Insert fake_quantize_op in the graph.
@@ -128,8 +173,8 @@ class QuantizationPerformer(object):
         if quant_type == 'abs_max':
             return self._insert_quant_abs_max_op(graph, var_node, quant_bits)
         elif quant_type == 'range_abs_max':
-            return self._inser_quant_range_abs_max_op(graph, var_node,
-                                                      quant_bits)
+            return self._insert_quant_range_abs_max_op(graph, var_node,
+                                                       quant_bits)
 
     def _insert_quant_abs_max_op(self, graph, var_node, quant_bits):
         """
@@ -237,14 +282,14 @@ class QuantizationPerformer(object):
         return dequant_var_node
 
     def _update_input(self, old_input_node, new_input_node, op_node):
-        old_input_node.outputs.remove(op_node)
-        op_node.inputs.remove(old_input_node)
-        new_input_node.outputs.append(op_node)
-        op_node.inputs.append(new_input_node)
-
-    def _link_to(node_in, node_out):
-        node_in.outputs.append(node_out)
-        node_out.inputs.append(node_in)
+        old_input_node.outputs_remove(op_node)
+        op_node.inputs_remove(old_input_node)
+        new_input_node.outputs_append(op_node)
+        op_node.inputs_append(new_input_node)
+
+    def _link_to(self, node_in, node_out):
+        node_in.outputs_append(node_out)
+        node_out.inputs_append(node_in)
 
     def _quantized_var_name(self, var_name):
         """
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py
new file mode 100644
index 00000000000..771d880a28d
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py
@@ -0,0 +1,135 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import random
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import six
+from paddle.fluid.framework import Program
+from paddle.fluid.contrib.slim.quantization import QuantizationPerformer
+from paddle.fluid.contrib.slim.graph import PyGraph
+from paddle.fluid import core
+
+
+def linear_fc(num):
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        hidden = fluid.layers.fc(hidden, size=128, act='relu')
+    loss = fluid.layers.cross_entropy(input=hidden, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestQuantizationPerformer(unittest.TestCase):
+    def setUp(self):
+        # fake quant/dequant op types; input slots of quantizable ops and grads
+        self.weight_quant_op_type = 'fake_quantize_abs_max'
+        self.dequant_op_type = 'fake_dequantize_max_abs'
+        self.quantizable_op_and_inputs = {
+            'conv2d': ['Input', 'Filter'],
+            'depthwise_conv2d': ['Input', 'Filter'],
+            'mul': ['X', 'Y']
+        }
+        self.quantizable_op_grad_and_inputs = {
+            'conv2d_grad': ['Input', 'Filter'],
+            'depthwise_conv2d_grad': ['Input', 'Filter'],
+            'mul_grad': ['X', 'Y']
+        }
+
+    def linear_fc_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = linear_fc(3)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        graph = PyGraph(core.Graph(main.desc))
+        performer = QuantizationPerformer(activation_quantize_type=quant_type)
+        performer.quantize_transform(graph, False)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        graph.draw_graph('.', 'quantize_fc_' + quant_type, marked_nodes)
+
+    def test_linear_fc_quant_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.linear_fc_quant('abs_max')
+
+    def test_linear_fc_quant_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.linear_fc_quant('range_abs_max')
+
+    def residual_block_quant(self, quant_type):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = residual_block(2)
+            opt = fluid.optimizer.Adam(learning_rate=0.001)
+            opt.minimize(loss)
+        graph = PyGraph(core.Graph(main.desc))
+        performer = QuantizationPerformer(activation_quantize_type=quant_type)
+        performer.quantize_transform(graph, False)
+        marked_nodes = set()
+        for op in graph.all_ops():
+            if op.name().find('quantize') > -1:
+                marked_nodes.add(op)
+        graph.draw_graph('.', 'quantize_residual_' + quant_type, marked_nodes)
+
+    def test_residual_block_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_abs_max'
+        self.residual_block_quant('abs_max')
+
+    def test_residual_block_range_abs_max(self):
+        self.act_quant_op_type = 'fake_quantize_range_abs_max'
+        self.residual_block_quant('range_abs_max')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index c9afe6c8856..e41bd4d377a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -113,6 +113,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.core',
           'paddle.fluid.contrib.slim.graph',
           'paddle.fluid.contrib.slim.prune',
+          'paddle.fluid.contrib.slim.quantization',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
-- 
GitLab


From 7ab4af27160f692711b3f793e800af8d9eb36409 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Mon, 21 Jan 2019 10:12:01 +0800
Subject: [PATCH 33/73] Fix brpc compilation. (#15417)

---
 paddle/fluid/operators/distributed/CMakeLists.txt         | 8 +++-----
 .../fluid/operators/distributed/collective_server_test.cc | 3 +--
 paddle/testing/paddle_gtest_main.cc                       | 6 +++++-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 7fcbf85f187..6a61a8d7861 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -29,11 +29,9 @@ if(WITH_GRPC)
     DEPS ${RPC_DEPS} scope profiler math_function SERIAL)
 
 else()
-  set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
-      collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
+  set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-  set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc/server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc)
   brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc
       request_handler_impl.cc rpc_client.cc rpc_server.cc
       variable_response.cc
@@ -54,6 +52,6 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
 cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 if(WITH_GPU)
     cc_test(collective_server_test SRCS collective_server_test.cc 
-        DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+        DEPS sendrecvop_rpc executor ${RPC_DEPS}
         selected_rows_functor  scope math_function SERIAL)
 endif()
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
index 46c761000c3..5009058422b 100644
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -82,8 +82,7 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
   std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
 }
 
-TEST(PREFETCH, GPU) {
-  setenv("FLAGS_max_body_size", "2147483647", 1);
+TEST(CollectiveServer, GPU) {
   platform::CUDAPlace place;
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto& ctx = *pool.Get(place);
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 47c5248b57d..e91fa929243 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -32,7 +32,11 @@ int main(int argc, char** argv) {
   std::vector<std::string> envs;
   std::vector<std::string> undefok;
 #if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC)
-  envs.push_back("max_body_size");
+  std::string str_max_body_size;
+  if (google::GetCommandLineOption("max_body_size", &str_max_body_size)) {
+    setenv("FLAGS_max_body_size", "2147483647", 1);
+    envs.push_back("max_body_size");
+  }
 #endif
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-- 
GitLab


From 530869f829821347405e7a80ee5e07c9bb5daa94 Mon Sep 17 00:00:00 2001
From: whs <wanghaoshuang@baidu.com>
Date: Mon, 21 Jan 2019 10:44:43 +0800
Subject: [PATCH 34/73] Share LoD from Input(Rois). (#15420)

test=develop
---
 paddle/fluid/operators/detection/roi_perspective_transform_op.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
index 3796854fe67..a97828e6fe9 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
@@ -493,6 +493,7 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
     auto out_dims = framework::make_ddim(out_dims_v);
 
     ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("ROIs", /*->*/ "Out");
   }
 
  protected:
-- 
GitLab


From 9252aa41f5af28f73f890b775ce2648c02c45724 Mon Sep 17 00:00:00 2001
From: Wu Yi <typhoonzero1986@gmail.com>
Date: Mon, 21 Jan 2019 11:12:39 +0800
Subject: [PATCH 35/73] add multi process start script (#15381)

* add multi process start script test=develop

* refine tool test=develop
---
 tools/run_mp.py | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 tools/run_mp.py

diff --git a/tools/run_mp.py b/tools/run_mp.py
new file mode 100644
index 00000000000..2485400ab81
--- /dev/null
+++ b/tools/run_mp.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import subprocess
+import os
+import sys
+import time
+import argparse
+
+default_envs = {
+    "PADDLE_TRAINER_ENDPOINTS":
+    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
+    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+    "PATH": os.getenv("PATH"),
+    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+    "PADDLE_TRAINERS_NUM": "8",
+    "NCCL_DEBUG": "INFO",
+    "GLOG_v": "0",
+    "NCCL_SOCKET_IFNAME": "eth0",
+    "NCCL_IB_GID_INDEX": "3",
+    "NCCL_IB_RETRY_CNT": "0",
+}
+
+GPUS = 8
+
+
+def start_procs(gpus, cmd, log_dir):
+    """Launch one copy of `cmd` per GPU and wait for all of them to exit.
+
+    Args:
+        gpus (int): number of worker processes to start on this node.
+        cmd (str): command line to run; split on whitespace, no shell quoting.
+        log_dir (str): directory that receives one `workerlog.<i>` per worker.
+    """
+    if not os.path.isdir(log_dir):
+        os.makedirs(log_dir)  # no shell involved, unlike `mkdir -p`
+    # Work on a copy so repeated calls do not mutate the module-level defaults;
+    # unset (None) entries are dropped because Popen requires string values.
+    base_env = {k: v for k, v in default_envs.items() if v is not None}
+    base_env.update((k, v) for k, v in os.environ.items()
+                    if k.startswith(("FLAGS_", "NCCL_", "GLOG_")))
+
+    # ======== for dist training =======
+    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+    current_ip = os.getenv("POD_IP", "127.0.0.1")
+    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
+    # Ports are offsets from 6170; "617%d" produced 61710 for worker 10.
+    endpoints = ",".join("%s:%d" % (ip, 6170 + i)
+                         for ip in trainer_ips for i in range(gpus))
+    nranks = len(trainer_ips) * gpus
+    procs = []
+    log_fns = []
+    try:
+        for i in range(gpus):
+            curr_env = dict(base_env)
+            curr_env.update({
+                "FLAGS_selected_gpus": "%d" % i,
+                "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
+                "PADDLE_CURRENT_ENDPOINT": "%s:%d" % (current_ip, 6170 + i),
+                "PADDLE_TRAINERS_NUM": "%d" % nranks,
+                "PADDLE_TRAINER_ENDPOINTS": endpoints
+            })
+            print("starting process ", i, cmd, curr_env)
+            fn = open(os.path.join(log_dir, "workerlog.%d" % i), "w")
+            log_fns.append(fn)
+            procs.append(subprocess.Popen(
+                cmd.split(), stdout=fn, stderr=fn, env=curr_env))
+        for proc in procs:
+            proc.wait()
+    finally:
+        # Never leave orphaned workers or open log files behind.
+        for proc in procs:
+            if proc.poll() is None:
+                proc.terminate()
+        for fn in log_fns:
+            fn.close()
+
+
+def main():
+    """Parse --gpus/--cmd/--log_dir and hand them to start_procs."""
+    parser = argparse.ArgumentParser(
+        # Keep the line breaks of the env-var list below in --help output.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='''start paddle training using multi-process mode.
+NOTE: your train program ***must*** run as distributed nccl2 mode,
+see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2-
+And your train program must read environment variables below in order to let different
+process init properly:
+FLAGS_selected_gpus
+PADDLE_TRAINER_ID
+PADDLE_CURRENT_ENDPOINT
+PADDLE_TRAINERS_NUM
+PADDLE_TRAINER_ENDPOINTS
+POD_IP (current node ip address, not needed for local training)
+''')
+    parser.add_argument(
+        '--gpus', type=int, default=8,
+        help='start number of processes for every gpu')
+    parser.add_argument(
+        '--cmd', type=str, default="",
+        help='command to run for each process, e.g. python train.py --lr 0.1')
+    parser.add_argument(
+        '--log_dir', type=str, default="mylog",
+        help='directory to put logs per process.')
+    args = parser.parse_args()
+    if not args.cmd.strip():
+        # An empty or blank command cannot be launched, so show usage instead.
+        # sys.exit rather than exit(): the latter only exists when the
+        # `site` module is loaded.
+        parser.print_help()
+        sys.exit(0)
+    start_procs(args.gpus, args.cmd, args.log_dir)
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab


From 9f8f0fc2d307e85045dc45d362bb5672a9b24011 Mon Sep 17 00:00:00 2001
From: Dun <randonlang@gmail.com>
Date: Mon, 21 Jan 2019 11:18:51 +0800
Subject: [PATCH 36/73] Memory optimization of depthwise conv op and group norm
 op (#15313)

* mem opt

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* refine code  test=develop

* refine code  test=develop

* refine code  test=develop

* refine code  test=develop

* refine with cub test=develop

* fix mkldnn test && remove comments && test=develop

* polish code && test=develop

* add only_forward test && test=develop
---
 paddle/fluid/framework/details/CMakeLists.txt |   3 +-
 .../fluid/framework/details/build_strategy.cc |  10 +
 .../fluid/framework/details/build_strategy.h  |   2 +
 paddle/fluid/framework/ir/CMakeLists.txt      |   1 +
 .../ir/fuse_relu_depthwise_conv_pass.cc       | 159 ++++++++++++
 .../ir/fuse_relu_depthwise_conv_pass.h        |  42 ++++
 paddle/fluid/operators/conv_mkldnn_op.cc      |  15 +-
 paddle/fluid/operators/conv_op.cc             |  34 ++-
 paddle/fluid/operators/conv_op.h              |  49 ++--
 paddle/fluid/operators/group_norm_op.cc       |  33 ++-
 paddle/fluid/operators/group_norm_op.cu       | 186 +++++++-------
 paddle/fluid/operators/group_norm_op.h        |  69 +++---
 paddle/fluid/operators/math/depthwise_conv.cu | 233 ++++++++++--------
 paddle/fluid/operators/math/depthwise_conv.h  |   9 +-
 paddle/fluid/pybind/pybind.cc                 |  14 ++
 .../paddle/fluid/contrib/memory_usage_calc.py |   2 +-
 python/paddle/fluid/layers/nn.py              |   1 +
 .../unittests/parallel_executor_test_base.py  |   5 +-
 .../fluid/tests/unittests/test_conv2d_op.py   |  86 ++++++-
 .../test_fuse_relu_depthwise_conv_pass.py     | 149 +++++++++++
 20 files changed, 846 insertions(+), 256 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
 create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index c1ba6606f10..d5966ad5a97 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -93,5 +93,6 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
 cc_library(build_strategy SRCS build_strategy.cc DEPS
         graph_viz_pass multi_devices_graph_pass
         multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass
+        fuse_elewise_add_act_pass multi_batch_merge_pass 
+        fuse_relu_depthwise_conv_pass
         memory_optimize_pass lock_free_optimize_pass)
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index df0ff772c9d..756470c5b0b 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -55,6 +55,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
 
     // Add op fusion.
+    if (strategy.fuse_relu_depthwise_conv_) {
+      AppendPass("fuse_relu_depthwise_conv_pass");
+    }
     if (strategy.fuse_elewise_add_act_ops_) {
       auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
       // Add a graph viz pass to record a graph.
@@ -210,6 +213,12 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Set<const std::vector<OpDesc *>>(
           kAllOpDescs,
           new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
+    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
+      if (!use_cuda) {
+        LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
+                        "GPU, skipped.";
+        continue;
+      }
     }
     graph = pass->Apply(std::move(graph));
   }
@@ -220,6 +229,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
 }  // namespace framework
 }  // namespace paddle
 
+USE_PASS(fuse_relu_depthwise_conv_pass);
 USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_batch_merge_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 15c2e01b614..603df2e0693 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -74,6 +74,8 @@ struct BuildStrategy {
 
   bool fuse_elewise_add_act_ops_{false};
 
+  bool fuse_relu_depthwise_conv_{false};
+
   bool memory_optimize_{false};
 
   bool memory_early_delete_{false};
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 84b53212647..b118dccd1b3 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -70,6 +70,7 @@ if(WITH_MKLDNN)
 endif()
 
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
+cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector )
 
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
new file mode 100644
index 00000000000..0d94008ea82
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc
@@ -0,0 +1,159 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  graph = FuseReluDepthwiseConv(std::move(graph), true);
+  graph = FuseReluDepthwiseConv(std::move(graph), false);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
+    std::unique_ptr<ir::Graph> graph, bool only_forward) const {
+  PADDLE_ENFORCE(graph.get());
+  if (only_forward)
+    FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get());
+  else
+    FusePassBase::Init("relu_depthwise_conv", graph.get());
+  /*
+           x ---act--> y ---layer-> z
+            +----------+
+            ↓          ↓
+    x' <--act'--- y' <-layer'--- z'
+
+    fuse to:
+
+           x ---act-layer-> z
+           |
+           ↓
+    x' <--act-layer'--- z'
+
+  */
+
+  GraphPatternDetector gpd;
+  auto *pattern = gpd.mutable_pattern();
+  std::string act_type = "relu";
+  std::string layer_type = "depthwise_conv2d";
+  auto *x = pattern->NewNode("x")->AsInput();
+  auto *y = pattern->NewNode("y")->AsIntermediate();
+  auto *z = pattern->NewNode("z")->AsOutput();
+  PDNode *xg = nullptr;
+  PDNode *yg = nullptr;
+  PDNode *zg = nullptr;
+  if (!only_forward) {
+    xg = pattern->NewNode("xg")->AsOutput();
+    yg = pattern->NewNode("yg")->AsIntermediate();
+    zg = pattern->NewNode("zg")->AsInput();
+  }
+
+  PDNode *act_g = nullptr;
+  PDNode *layer_g = nullptr;
+  auto *act = pattern->NewNode("act")->assert_is_op(act_type);
+  auto *layer = pattern->NewNode("layer")->assert_is_op(layer_type);
+  if (!only_forward) {
+    act_g = pattern->NewNode("act_g")->assert_is_op(act_type + "_grad");
+    layer_g = pattern->NewNode("layer_g")->assert_is_op(layer_type + "_grad");
+  }
+
+  act->LinksFrom({x}).LinksTo({y});
+  layer->LinksFrom({y}).LinksTo({z});
+  if (!only_forward) {
+    layer_g->LinksFrom({y, zg}).LinksTo({yg});
+    act_g->LinksFrom({y, yg}).LinksTo({xg});
+  }
+
+  int count = 0;
+  std::unordered_set<const Node *> need_removed_nodes;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseReluDepthwiseConv fuse";
+    // 1. turn on fuse option
+    auto *layer_op = subgraph.at(layer)->Op();
+    layer_op->SetAttr("use_cudnn", false);
+    layer_op->SetAttr("fuse_relu_before_depthwise_conv", true);
+
+    OpDesc *layer_g_op = nullptr;
+    if (!only_forward) {
+      layer_g_op = subgraph.at(layer_g)->Op();
+      layer_g_op->SetAttr("use_cudnn", false);
+      layer_g_op->SetAttr("fuse_relu_before_depthwise_conv", true);
+    }
+    // 2. connect x to layer and layer_g, layer_g to xg
+    auto *y_var = subgraph.at(y)->Var();
+    auto *x_var = subgraph.at(x)->Var();
+    VarDesc *yg_var = nullptr;
+    VarDesc *xg_var = nullptr;
+    if (!only_forward) {
+      yg_var = subgraph.at(yg)->Var();
+      xg_var = subgraph.at(xg)->Var();
+    }
+
+    PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1);
+    PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
+    layer_op->SetInput("Input", {x_var->Name()});
+    subgraph.at(layer)->inputs.push_back(subgraph.at(x));
+    subgraph.at(x)->outputs.push_back(subgraph.at(layer));
+    VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
+
+    if (!only_forward) {
+      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
+      layer_g_op->SetInput("Input", {x_var->Name()});
+      subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
+      subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
+
+      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1);
+      PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
+                        yg_var->Name());
+      layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
+      subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg));
+      subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g));
+      VLOG(4) << "replace " << yg_var->Name() << " -> " << xg_var->Name();
+    }
+
+    // 3. delete y, yg, act, act_g
+
+    if (only_forward) {
+      need_removed_nodes.insert({subgraph.at(y), subgraph.at(act)});
+    } else {
+      need_removed_nodes.insert({subgraph.at(y), subgraph.at(yg),
+                                 subgraph.at(act), subgraph.at(act_g)});
+    }
+    count++;
+  };
+  gpd(graph.get(), handler);
+  GraphSafeRemoveNodes(graph.get(), need_removed_nodes);
+  AddStatis(count);
+
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_relu_depthwise_conv_pass,
+              paddle::framework::ir::FuseReluDepthwiseConvPass);
diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
new file mode 100644
index 00000000000..6bd653775e4
--- /dev/null
+++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fold a relu into the depthwise_conv2d that consumes it (and their grads).
+ */
+class FuseReluDepthwiseConvPass : public FusePassBase {
+ public:
+  virtual ~FuseReluDepthwiseConvPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
+      std::unique_ptr<ir::Graph> graph, bool only_forward) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 16ffc11419f..0ce174654e8 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -143,7 +143,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     // Get unique name for storing MKLDNN primitives
     const std::string key = platform::ConvMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
-        ctx.op().Output("Output"));
+        ctx.op().Input("Input") + ctx.op().Input("Filter"));
     const std::string key_conv_pd = key + "@conv_pd";
 
     std::vector<primitive> pipeline;
@@ -371,7 +371,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     platform::ConvMKLDNNHandler::AppendKey(
         &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
         input->format(), fuse_relu, fuse_residual_conn,
-        ctx.op().Output("Output"));
+        ctx.op().Input("Input") + ctx.op().Input("Filter"));
     const std::string key_conv_pd = key + "@conv_pd";
 
     bool need_s8_to_u8 = false;
@@ -798,7 +798,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     const Tensor* input = ctx.Input<Tensor>("Input");
     const Tensor* filter = ctx.Input<Tensor>("Filter");
-    const Tensor* output = ctx.Input<Tensor>("Output");
     const Tensor* output_grad =
         ctx.Input<Tensor>(framework::GradVarName("Output"));
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
@@ -810,9 +809,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
                        filter->format() != memory::format::format_undef,
                    "Wrong layout/format set for Filter tensor");
-    PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN &&
-                       output->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Output tensor");
     PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
                        output_grad->format() != memory::format::format_undef,
                    "Wrong layout/format set for output_grad tensor");
@@ -840,18 +836,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     int g = std::max(groups, 1);
     GetWeightsTz(weights_tz, g, is_conv3d);
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    std::vector<int> dst_tz =
+        paddle::framework::vectorize2int(output_grad->dims());
 
     auto src_format = input->format();
     mkldnn::memory::format weights_format =
         GetWeightsFormat(filter->format(), g, is_conv3d);
 
-    // Get an unique name from "argument" name of "Output" variable
+    // Get a unique name from "argument" names of "Input" and "Filter" variables
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
     const std::string key = platform::ConvMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
-        ctx.op().Input("Output"));
+        ctx.op().Input("Input") + ctx.op().Input("Filter"));
 
     const std::string key_conv_pd = key + "@conv_pd";
     std::vector<primitive> pipeline;
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index c8b33b8932d..bd788f03e7d 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -171,6 +171,9 @@ void Conv2DOpMaker::Make() {
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("fuse_relu_before_depthwise_conv",
+                "(bool, default false) Only used in cuda depthwise kernel")
+      .SetDefault(false);
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
@@ -412,18 +415,43 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
                                  customized_type_value);
 }
 
+class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Builds the "<forward type>_grad" OpDesc for the forward conv op.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType(GradOpType());
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput("Bias", Input("Bias"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+    // The forward "Output" itself is not wired in, only its gradient.
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    // Copy the forward op's attributes to the grad op unchanged.
+    op->SetAttrMap(Attrs());
+    // `op` owns the desc from construction, so nothing leaks on a throw.
+    return op;
+  }
+  // Grad op type name: the forward op type with a "_grad" suffix.
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  ops::ConvOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvOpInferVarType, ops::Conv2dGradMaker);
 REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
 // depthwise convolution op
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvOpInferVarType, ops::Conv2dGradMaker);
 REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
 
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index eaa288edc55..797c6651659 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -397,12 +397,18 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-
-    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-
+    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
-                  output);
+
+    if (fuse_relu) {
+      math::DepthwiseConvFunctor<DeviceContext, T, true> depthwiseConv;
+      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                    output);
+    } else {
+      math::DepthwiseConvFunctor<DeviceContext, T, false> depthwiseConv;
+      depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                    output);
+    }
   }
 };
 
@@ -424,27 +430,42 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    bool fuse_relu = context.Attr<bool>("fuse_relu_before_depthwise_conv");
 
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
 
-    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
-        depthwiseConvInputGrad;
-    math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
-        depthwiseConvFilterGrad;
-
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, input_grad, static_cast<T>(0));
-      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, dilations, input_grad);
+
+      if (fuse_relu) {
+        math::DepthwiseConvInputGradFunctor<DeviceContext, T, true>
+            depthwiseConvInputGrad;
+        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                               paddings, dilations, input_grad);
+      } else {
+        math::DepthwiseConvInputGradFunctor<DeviceContext, T, false>
+            depthwiseConvInputGrad;
+        depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
+                               paddings, dilations, input_grad);
+      }
     }
 
     if (filter_grad) {
       filter_grad->mutable_data<T>(context.GetPlace());
       set_zero(dev_ctx, filter_grad, static_cast<T>(0));
-      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              dilations, filter_grad);
+      if (fuse_relu) {
+        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, true>
+            depthwiseConvFilterGrad;
+        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
+                                paddings, dilations, filter_grad);
+      } else {
+        math::DepthwiseConvFilterGradFunctor<DeviceContext, T, false>
+            depthwiseConvFilterGrad;
+        depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides,
+                                paddings, dilations, filter_grad);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
index 4fa15058f86..e18d9841bb8 100644
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/group_norm_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -102,8 +103,8 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     // check input
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of GroupNormOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Mean"),
                    "Input(Mean) of GroupNormOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Variance"),
@@ -113,7 +114,7 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
 
     // check output
     if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
     }
     if (ctx->HasOutput(framework::GradVarName("Scale"))) {
       ctx->SetOutputDim(framework::GradVarName("Scale"),
@@ -145,12 +146,36 @@ class GroupNormGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class GroupNormGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Builds the group_norm_grad OpDesc from the forward op's variables.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("group_norm_grad");
+    op->SetInput("Scale", Input("Scale"));
+    op->SetInput("Bias", Input("Bias"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetInput("Y", Output("Y"));
+    op->SetInput("Mean", Output("Mean"));
+    op->SetInput("Variance", Output("Variance"));
+    // The forward input "X" is not wired in; the forward output "Y" is.
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    // Copy the forward op's attributes to the grad op unchanged.
+    op->SetAttrMap(Attrs());
+    // `op` owns the desc from construction, so nothing leaks on a throw.
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::GroupNormGradMaker);
 REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index 27174630227..6e460c470be 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -12,12 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cub/cub.cuh>
+#include "cub/cub.cuh"
 #include "paddle/fluid/operators/group_norm_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 
 namespace paddle {
 namespace operators {
 
+enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
+
+#define CHECK_CASE(i, flags, kernel_name, args...)                   \
+  if (i == flags) {                                                  \
+    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(args); \
+  }
+
+// 0 for no scale, no bias
+// 1 for has scale, no bias
+// 2 for no scale, has bias
+// 3 for has scale, has bias
+#define UNROLL_ALL_CASES(flags, kernel_name, args...) \
+  CHECK_CASE(0, flags, kernel_name, args)             \
+  CHECK_CASE(1, flags, kernel_name, args)             \
+  CHECK_CASE(2, flags, kernel_name, args)             \
+  CHECK_CASE(3, flags, kernel_name, args)
+
+template <typename T>
+__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
+  typedef cub::WarpReduce<T> WarpReduce;
+  typename WarpReduce::TempStorage temp_storage;
+  value = WarpReduce(temp_storage).Sum(value);
+  if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
+}
+
 template <typename T>
 __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
                                               int imsize, int groups,
@@ -36,21 +62,11 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
   }
   x_mean /= number * imsize;
   x_var /= number * imsize;
-  __shared__ T s_mem[2];
-  if (threadIdx.x == 0) {
-    s_mem[0] = s_mem[1] = 0;
-  }
-  __syncthreads();
-  paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean);
-  paddle::platform::CudaAtomicAdd(&s_mem[1], x_var);
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]);
-    paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]);
-  }
+  CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean);
+  CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var);
 }
 
-template <typename T>
+template <typename T, int flags>
 __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
                                  const T* scale, const T* bias, int N, int C,
                                  int imsize, int groups, int group_size,
@@ -68,8 +84,8 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var,
   for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
     T val = x[(bid * C + ccid) * imsize + imid];
     val = (val - x_mean) * var_inv;
-    if (scale) val *= scale[gid * group_size + cid];
-    if (bias) val += bias[gid * group_size + cid];
+    if (flags & kHasScale) val *= scale[gid * group_size + cid];
+    if (flags & kHasBias) val += bias[gid * group_size + cid];
     y[(bid * C + ccid) * imsize + imid] = val;
   }
 }
@@ -115,93 +131,87 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
     if (bias) bias_data = bias->data<T>();
 
     int imsize = x_dims[2] * x_dims[3];
-    int block_size = std::min(512, imsize);
+    int block_size = std::min(1024, imsize);
     dim3 grid(group_size, groups, x_dims[0]);
     dim3 threads(block_size, 1, 1);
     GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
         x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data,
         temp_var_data);
-    GroupNormForward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0],
-        x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data);
+    int flags =
+        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
+    UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data,
+                     scale_data, bias_data, x_dims[0], x_dims[1], imsize,
+                     groups, group_size, epsilon, y_data, var_data);
   }
 };
 
-template <typename T>
-__global__ void GroupNormBackwardGetMeanAndVar(
-    const T* x, const T* mean, const T* var, const T* scale, const T* d_y,
-    int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x,
-    T* d_mean, T* d_var, T* d_scale, T* d_bias) {
+template <typename T, int flags>
+__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale,
+                                               const T* bias, const T* d_y,
+                                               int N, int C, int imsize,
+                                               int groups, int group_size,
+                                               T epsilon, T* d_mean, T* d_var,
+                                               T* d_scale, T* d_bias) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
   int number = min(group_size, static_cast<int>(C - gid * group_size));
   int ccid = gid * group_size + cid;
   if (ccid >= C) return;
-  T x_mean = mean[bid * groups + gid];
-  T x_var = var[bid * groups + gid];
-  T var_inv = 1.0 / sqrt(x_var + epsilon);
-  T d_var_inv = 0, d_x_mean = 0;
+  T x_scale = (flags & kHasScale) ? scale[ccid] : 1;
+  T x_bias = (flags & kHasBias) ? bias[ccid] : 0;
+  T x_scale_inv = 0;
+  if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
   T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;
 
   for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
-    T tmp = x[(bid * C + ccid) * imsize + imid];
-    T val = (tmp - x_mean) * var_inv;
+    T val = x[(bid * C + ccid) * imsize + imid] - x_bias;
     T dval = d_y[(bid * C + ccid) * imsize + imid];
-    if (d_bias) d_bias_data += dval;
-    if (d_scale) d_scale_data += val * dval;
-    if (scale) dval = dval * scale[ccid];
-    d_var_data += (tmp - x_mean) * dval;
-    T d_tmp = dval * var_inv;
-    if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp;
-    d_mean_data -= d_tmp;
-  }
 
-  __shared__ T s_mem[4];
-  if (threadIdx.x == 0) {
-    s_mem[0] = s_mem[1] = 0;
-    if (d_scale) s_mem[2] = 0;
-    if (d_bias) s_mem[3] = 0;
-  }
-  __syncthreads();
-  paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data);
-  paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data);
-  if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data);
-  if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data);
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]);
-    paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]);
-    if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]);
-    if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]);
+    d_var_data += val * dval;
+    d_mean_data += dval * x_scale;
+
+    val = val * x_scale_inv;
+    d_bias_data += dval;
+    d_scale_data += val * dval;
   }
+  CudaAtomicAddWithWarp(&d_mean[bid * groups + gid], d_mean_data);
+  CudaAtomicAddWithWarp(&d_var[bid * groups + gid], d_var_data);
+  if (flags & kHasScale) CudaAtomicAddWithWarp(&d_scale[ccid], d_scale_data);
+  if (flags & kHasBias) CudaAtomicAddWithWarp(&d_bias[ccid], d_bias_data);
 }
 
-template <typename T>
-__global__ void GroupNormBackward(const T* x, const T* mean, const T* var,
-                                  const T* d_mean, const T* d_var, int N, int C,
-                                  int imsize, int groups, int group_size,
-                                  T epsilon, T* d_x) {
+template <typename T, int flags>
+__global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale,
+                                  const T* bias, const T* var, const T* d_mean,
+                                  const T* d_var, int N, int C, int imsize,
+                                  int groups, int group_size, T epsilon,
+                                  T* d_x) {
   int gid = blockIdx.y;
   int cid = blockIdx.x;
   int bid = blockIdx.z;
   int number = min(group_size, static_cast<int>(C - gid * group_size));
   int ccid = gid * group_size + cid;
   if (ccid >= C) return;
-  T x_mean = mean[bid * groups + gid];
   T x_var = var[bid * groups + gid];
   T d_x_mean = d_mean[bid * groups + gid];
-  T d_var_inv = d_var[bid * groups + gid];
+  T d_x_var = d_var[bid * groups + gid];
+
+  T x_var_inv = 1.0 / sqrt(x_var + epsilon);
+  T number_inv = 1.0 / (number * imsize);
+
+  T x_scale = (flags & kHasScale) ? scale[ccid] : 1;
+  T x_bias = (flags & kHasBias) ? bias[ccid] : 0;
+  T x_scale_inv = 0;
+  if (x_scale != 0) x_scale_inv = 1.0 / x_scale;
 
-  T d_x_var =
-      -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
-  d_x_mean -= 2 * d_x_var * x_mean;
-  d_x_var /= number * imsize;
-  d_x_mean /= number * imsize;
   for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
     T tmp = x[(bid * C + ccid) * imsize + imid];
-    if (d_x)
-      d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var;
+    T v_y = (tmp - x_bias) * x_scale_inv;
+    T dly = d_y[(bid * C + ccid) * imsize + imid];
+    d_x[(bid * C + ccid) * imsize + imid] =
+        x_var_inv *
+        (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean);
   }
 }
 
@@ -211,10 +221,10 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
-    auto* x = ctx.Input<Tensor>("X");
-    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* x = ctx.Input<Tensor>("Y");
     auto* var = ctx.Input<Tensor>("Variance");
     auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
     auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto groups = ctx.Attr<int>("groups");
 
@@ -226,11 +236,7 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
     const auto& x_dims = x->dims();
     const int group_size = (x_dims[1] - 1) / groups + 1;
 
-    T* d_x_data = nullptr;
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-      d_x_data = d_x->data<T>();
-    }
+    if (d_x) d_x->mutable_data<T>(ctx.GetPlace());  // d_x may be null
     math::SetConstant<platform::CUDADeviceContext, T> set_zero;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
 
@@ -245,8 +251,9 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
     T* temp_mean_data = temp_mean.data<T>();
 
     auto* x_data = x->data<T>();
+    T* d_x_data = nullptr;
+    if (d_x) d_x_data = d_x->data<T>();
     auto* y_data = d_y->data<T>();
-    auto* mean_data = mean->data<T>();
     auto* var_data = var->data<T>();
     T* d_scale_data = nullptr;
     if (d_scale) {
@@ -263,18 +270,25 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
 
     const T* scale_data = nullptr;
     if (scale) scale_data = scale->data<T>();
+    const T* bias_data = nullptr;
+    if (bias) bias_data = bias->data<T>();
 
     int imsize = x_dims[2] * x_dims[3];
-    int block_size = std::min(512, imsize);
+    int block_size = std::min(1024, imsize);
     dim3 grid(group_size, groups, x_dims[0]);
     dim3 threads(block_size, 1, 1);
-    GroupNormBackwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1],
-        imsize, groups, group_size, epsilon, d_x_data, temp_mean_data,
-        temp_var_data, d_scale_data, d_bias_data);
-    GroupNormBackward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
-        x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0],
-        x_dims[1], imsize, groups, group_size, epsilon, d_x_data);
+    int flags =
+        (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias;
+    UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data,
+                     bias_data, y_data, x_dims[0], x_dims[1], imsize, groups,
+                     group_size, epsilon, temp_mean_data, temp_var_data,
+                     d_scale_data, d_bias_data);
+    if (d_x_data != nullptr) {
+      UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data,
+                       bias_data, var_data, temp_mean_data, temp_var_data,
+                       x_dims[0], x_dims[1], imsize, groups, group_size,
+                       epsilon, d_x_data);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h
index 3d6c6a46a96..498e65f6149 100644
--- a/paddle/fluid/operators/group_norm_op.h
+++ b/paddle/fluid/operators/group_norm_op.h
@@ -96,10 +96,10 @@ class GroupNormGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
-    auto* x = ctx.Input<Tensor>("X");
-    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* x = ctx.Input<Tensor>("Y");
     auto* var = ctx.Input<Tensor>("Variance");
     auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
     auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto groups = ctx.Attr<int>("groups");
 
@@ -111,19 +111,13 @@ class GroupNormGradKernel : public framework::OpKernel<T> {
     const auto& x_dims = x->dims();
     const int group_size = (x_dims[1] - 1) / groups + 1;
 
-    // TODO(liangdun): need to check d_x is null
+    d_x->mutable_data<T>(ctx.GetPlace());
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    T* d_x_data = nullptr;
-    if (d_x) {
-      d_x->mutable_data<T>(ctx.GetPlace());
-      set_zero(dev_ctx, d_x, static_cast<T>(0));
-      d_x_data = d_x->data<T>();
-    }
 
     auto* x_data = x->data<T>();
+    auto* d_x_data = d_x->data<T>();
     auto* y_data = d_y->data<T>();
-    auto* mean_data = mean->data<T>();
     auto* var_data = var->data<T>();
     T* d_scale_data = nullptr;
     if (d_scale) {
@@ -140,6 +134,8 @@ class GroupNormGradKernel : public framework::OpKernel<T> {
 
     const T* scale_data = nullptr;
     if (scale) scale_data = scale->data<T>();
+    const T* bias_data = nullptr;
+    if (bias) bias_data = bias->data<T>();
 
     int imsize = x_dims[2] * x_dims[3];
     auto* iter_x_data = x_data;
@@ -147,46 +143,45 @@ class GroupNormGradKernel : public framework::OpKernel<T> {
     auto* iter_y_data = y_data;
     for (int bid = 0; bid < x_dims[0]; bid++)
       for (int gid = 0; gid < groups; gid++) {
-        T x_mean = mean_data[bid * groups + gid];
         T x_var = var_data[bid * groups + gid];
         T var_inv = 1.0 / sqrt(x_var + epsilon);
         int number = std::min(group_size,
                               static_cast<int>(x_dims[1] - gid * group_size));
-        auto* tmp = iter_x_data;
-        auto* tmp2 = iter_d_x_data;
-        T d_var_inv = 0, d_x_mean = 0;
+        T number_inv = 1.0 / (number * imsize);
+        auto* iter_x_data2 = iter_x_data;
+        auto* iter_y_data2 = iter_y_data;
+        T dp_scale = 0, dp_bias = 0;
         for (int cid = 0; cid < number; cid++) {
           for (int imid = 0; imid < imsize;
-               imid++, tmp++, iter_y_data++, iter_d_x_data++) {
-            T val = (tmp[0] - x_mean) * var_inv;
+               imid++, iter_x_data++, iter_y_data++) {
+            T val = iter_x_data[0];
+            if (bias_data) val -= bias_data[gid * group_size + cid];
             T dval = iter_y_data[0];
+            dp_scale += val * dval;
+            // Scale is optional; a missing Scale input behaves as all ones.
+            T v_scale = scale_data ? scale_data[gid * group_size + cid] : 1;
+            dp_bias += dval * v_scale;
+            if (v_scale != 0) val /= v_scale;
             if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
             if (d_scale_data)
               d_scale_data[gid * group_size + cid] += val * dval;
-            if (scale_data) dval = scale_data[gid * group_size + cid] * dval;
-
-            d_var_inv += (tmp[0] - x_mean) * dval;
-            T d_tmp = dval * var_inv;
-            if (d_x_data) iter_d_x_data[0] += d_tmp;
-            d_x_mean -= d_tmp;
           }
         }
 
-        T d_x_var =
-            -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
-        d_x_mean -= 2 * d_x_var * x_mean;
-        d_x_var /= number * imsize;
-        d_x_mean /= number * imsize;
-
-        iter_d_x_data = tmp2;
-
-        if (d_x_data) {
-          for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize;
-                 imid++, iter_x_data++, iter_d_x_data++) {
-              iter_d_x_data[0] += d_x_mean;
-              iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var;
-            }
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize;
+               imid++, iter_d_x_data++, iter_x_data2++, iter_y_data2++) {
+            T v_y = iter_x_data2[0];
+            T dly = iter_y_data2[0];
+            T dss = dp_scale;
+            T dbs = dp_bias;
+            T v_scale = scale_data ? scale_data[gid * group_size + cid] : 1;
+            T v_bias = bias_data ? bias_data[gid * group_size + cid] : 0;
+            v_y -= v_bias;
+            if (v_scale != 0) v_y /= v_scale;
+            iter_d_x_data[0] =
+                (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) *
+                var_inv;
           }
         }
       }
diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu
index 66d37c3bf31..240cec14dc2 100644
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #include <algorithm>
 #include <vector>
+#include "cub/cub.cuh"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
@@ -22,28 +24,11 @@ namespace operators {
 namespace math {
 
 template <typename T>
-__inline__ __device__ T warpReduceSum(T val) {
-#if CUDA_VERSION < 9000
-  for (int offset = 16; offset > 0; offset /= 2)
-    val += __shfl_down(val, offset);
-  return val;
-#else
-#define FULL_MASK 0xffffffff
-  for (int offset = 16; offset > 0; offset /= 2)
-    val += __shfl_down_sync(FULL_MASK, val, offset);
-  return val;
-#endif
-}
-__forceinline__ __device__ unsigned lane_id() {
-  unsigned ret;
-  asm volatile("mov.u32 %0, %laneid;" : "=r"(ret));
-  return ret;
-}
-
-__forceinline__ __device__ unsigned warp_id() {
-  unsigned ret;
-  asm volatile("mov.u32 %0, %warpid;" : "=r"(ret));
-  return ret;
+__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
+  typedef cub::WarpReduce<T> WarpReduce;
+  typename WarpReduce::TempStorage temp_storage;
+  value = WarpReduce(temp_storage).Sum(value);
+  if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value);
 }
 
 #define ARG_DEFINE_KernelDepthwiseConv                                         \
@@ -58,7 +43,7 @@ __forceinline__ __device__ unsigned warp_id() {
 
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NCHW format.
-template <typename T>
+template <typename T, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) {
   for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
     for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
@@ -87,7 +72,11 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) {
           if (h_in >= h_start && h_in < h_end && w_in >= w_start &&
               w_in < w_end) {
             const int offset = in_offset + h_in * input_width + w_in;
-            value += weight[weight_offset] * input_data[offset];
+            if (fuse_relu_before_conv) {
+              value += weight[weight_offset] * max(0.0f, input_data[offset]);
+            } else {
+              value += weight[weight_offset] * input_data[offset];
+            }
           }
           weight_offset++;
         }
@@ -100,7 +89,7 @@ __device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) {
   }
 }
 
-template <typename T, int c_filter>
+template <typename T, int c_filter, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvCFilter(
     ARG_DEFINE_KernelDepthwiseConv) {
   const int kWeghtSize = c_filter * c_filter;
@@ -137,7 +126,12 @@ __device__ __inline__ void KernelDepthwiseConvCFilter(
           if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
               w_in < input_width) {
             const int offset = in_offset + h_in * input_width + w_in;
-            value += r_weight[h_f * c_filter + w_f] * input_data[offset];
+            if (fuse_relu_before_conv) {
+              value += r_weight[h_f * c_filter + w_f] *
+                       max(0.0f, input_data[offset]);
+            } else {
+              value += r_weight[h_f * c_filter + w_f] * input_data[offset];
+            }
           }
         }
       }
@@ -149,18 +143,19 @@ __device__ __inline__ void KernelDepthwiseConvCFilter(
   }
 }
 
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter>
+template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
+          bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
   if (c_filter_multiplier == 0) {
     if (c_filter == -1)
-      KernelDepthwiseConv<T>(
+      KernelDepthwiseConv<T, fuse_relu_before_conv>(
           input_data, filter_data, batch_size, output_channels, output_height,
           output_width, input_channels, input_height, input_width,
           filter_multiplier, filter_height, filter_width, stride_height,
           stride_width, padding_height, padding_width, dilate_height,
           dilate_width, output_data);
     else
-      KernelDepthwiseConvCFilter<T, c_filter>(
+      KernelDepthwiseConvCFilter<T, c_filter, fuse_relu_before_conv>(
           input_data, filter_data, batch_size, output_channels, output_height,
           output_width, input_channels, input_height, input_width,
           filter_multiplier, filter_height, filter_width, stride_height,
@@ -168,14 +163,14 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
           dilate_width, output_data);
   } else {
     if (c_filter == -1)
-      KernelDepthwiseConv<T>(input_data, filter_data, batch_size,
-                             output_channels, output_height, output_width,
-                             input_channels, input_height, input_width,
-                             c_filter_multiplier, filter_height, filter_height,
-                             c_stride, c_stride, padding_height, padding_width,
-                             dilate_height, dilate_width, output_data);
+      KernelDepthwiseConv<T, fuse_relu_before_conv>(
+          input_data, filter_data, batch_size, output_channels, output_height,
+          output_width, input_channels, input_height, input_width,
+          c_filter_multiplier, filter_height, filter_height, c_stride, c_stride,
+          padding_height, padding_width, dilate_height, dilate_width,
+          output_data);
     else
-      KernelDepthwiseConvCFilter<T, c_filter>(
+      KernelDepthwiseConvCFilter<T, c_filter, fuse_relu_before_conv>(
           input_data, filter_data, batch_size, output_channels, output_height,
           output_width, input_channels, input_height, input_width,
           c_filter_multiplier, filter_height, filter_height, c_stride, c_stride,
@@ -186,17 +181,18 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) {
 
 // CUDA kernel to compute the depthwise convolution backprop w.r.t input.
 #define ARG_DEFINE_KernelDepthwiseConvInputGrad                                \
-  const T *const output_grad_data, const T *const filter_data,                 \
-      const int batch_size, const int output_channels,                         \
-      const int output_height, const int output_width,                         \
-      const int input_channels, const int input_height, const int input_width, \
+  const T *const input_data, const T *const output_grad_data,                  \
+      const T *const filter_data, const int batch_size,                        \
+      const int output_channels, const int output_height,                      \
+      const int output_width, const int input_channels,                        \
+      const int input_height, const int input_width,                           \
       const int filter_multiplier, const int filter_height,                    \
       const int filter_width, const int stride_height, const int stride_width, \
       const int padding_height, const int padding_width,                       \
       const int dilate_height, const int dilate_width,                         \
       T *const input_grad_data
 
-template <typename T>
+template <typename T, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvInputGrad(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
   for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
@@ -217,6 +213,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
       int w_out_end = w_in + padding_width;
 
       T value = 0;
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
+      if (fuse_relu_before_conv) {
+        if (input_data[index] <= 0) {
+          input_grad_data[index] = 0;
+          continue;
+        }
+      }
 
       for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier;
            c_out++) {
@@ -242,15 +247,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad(
           }
         }
       }
-      int index =
-          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-          w_in;
       input_grad_data[index] = value;
     }
   }
 }
 
-template <typename T, int c_filter, int c_filter_multiplier>
+template <typename T, int c_filter, int c_filter_multiplier,
+          bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
   const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1;
@@ -276,6 +279,15 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
       int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width;
 
       T value = 0;
+      int index =
+          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
+          w_in;
+      if (fuse_relu_before_conv) {
+        if (input_data[index] <= 0) {
+          input_grad_data[index] = 0;
+          continue;
+        }
+      }
 
       for (int c_i = 0; c_i < filter_multiplier; c_i++) {
         int c_out = c_in * filter_multiplier + c_i;
@@ -300,34 +312,33 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter(
           }
         }
       }
-      int index =
-          ((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
-          w_in;
       input_grad_data[index] = value;
     }
   }
 }
 
-template <typename T, int c_filter_multiplier, int c_stride, int c_filter>
+template <typename T, int c_filter_multiplier, int c_stride, int c_filter,
+          bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvInputGradSp(
     ARG_DEFINE_KernelDepthwiseConvInputGrad) {
   if (c_filter_multiplier == 0)
-    KernelDepthwiseConvInputGrad<T>(
-        output_grad_data, filter_data, batch_size, output_channels,
+    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
+        input_data, output_grad_data, filter_data, batch_size, output_channels,
         output_height, output_width, input_channels, input_height, input_width,
         filter_multiplier, filter_height, filter_width, stride_height,
         stride_width, padding_height, padding_width, dilate_height,
         dilate_width, input_grad_data);
   else if (c_filter == -1)
-    KernelDepthwiseConvInputGrad<T>(
-        output_grad_data, filter_data, batch_size, output_channels,
+    KernelDepthwiseConvInputGrad<T, fuse_relu_before_conv>(
+        input_data, output_grad_data, filter_data, batch_size, output_channels,
         output_height, output_width, input_channels, input_height, input_width,
         c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
         padding_height, padding_width, dilate_height, dilate_width,
         input_grad_data);
   else
-    KernelDepthwiseConvInputGradCFilter<T, c_filter, c_filter_multiplier>(
-        output_grad_data, filter_data, batch_size, output_channels,
+    KernelDepthwiseConvInputGradCFilter<T, c_filter, c_filter_multiplier,
+                                        fuse_relu_before_conv>(
+        input_data, output_grad_data, filter_data, batch_size, output_channels,
         output_height, output_width, input_channels, input_height, input_width,
         c_filter_multiplier, filter_height, filter_width, c_stride, c_stride,
         padding_height, padding_width, dilate_height, dilate_width,
@@ -335,7 +346,7 @@ __global__ void KernelDepthwiseConvInputGradSp(
 }
 
 // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
-template <typename T>
+template <typename T, bool fuse_relu_before_conv>
 __device__ __inline__ void KernelDepthwiseConvFilterGrad(
     const T* output_grad_data, const T* input_data, const int num,
     const int output_channels, const int output_height, const int output_width,
@@ -347,7 +358,6 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad(
   T s = 0;
 
   int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
-  int lid = lane_id();
 
   for (int image_w = threadIdx.x; image_w < output_width;
        image_w += blockDim.x) {
@@ -364,28 +374,28 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad(
         if (image_wk < 0 || image_wk >= input_width) continue;
 #define gaid(N, C, H, W) \
   ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
-
-        s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
-             input_data[((bid * (gridDim.z / filter_multiplier) +
-                          kernel_id / filter_multiplier) *
-                             input_height +
-                         image_hk) *
-                            input_width +
-                        image_wk];
+        int input_id = ((bid * (gridDim.z / filter_multiplier) +
+                         kernel_id / filter_multiplier) *
+                            input_height +
+                        image_hk) *
+                           input_width +
+                       image_wk;
+        if (fuse_relu_before_conv) {
+          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+               max(0.0f, input_data[input_id]);
+        } else {
+          s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
+               input_data[input_id];
+        }
 
 #undef gaid
       }
     }
   }
-#if __CUDA_ARCH__ >= 530
-  s = warpReduceSum<T>(s);
-  if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
-#else
-  paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s);
-#endif
+  CudaAtomicAddWithWarp(&filter_grad_data[gbid], s);
 }
 
-template <typename T, int c_filter_multiplier>
+template <typename T, int c_filter_multiplier, bool fuse_relu_before_conv>
 __global__ void KernelDepthwiseConvFilterGradSp(
     const T* output_grad_data, const T* input_data, const int num,
     const int output_channels, const int output_height, const int output_width,
@@ -395,14 +405,14 @@ __global__ void KernelDepthwiseConvFilterGradSp(
     const int padding_height, const int padding_width, const int dilate_height,
     const int dilate_width, T* filter_grad_data) {
   if (c_filter_multiplier == 0)
-    KernelDepthwiseConvFilterGrad<T>(
+    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
         output_grad_data, input_data, num, output_channels, output_height,
         output_width, input_channels, input_height, input_width,
         filter_multiplier, filter_height, filter_width, stride_height,
         stride_width, padding_height, padding_width, dilate_height,
         dilate_width, filter_grad_data);
   else
-    KernelDepthwiseConvFilterGrad<T>(
+    KernelDepthwiseConvFilterGrad<T, fuse_relu_before_conv>(
         output_grad_data, input_data, num, output_channels, output_height,
         output_width, input_channels, input_height, input_width,
         c_filter_multiplier, filter_height, filter_width, stride_height,
@@ -415,8 +425,9 @@ __global__ void KernelDepthwiseConvFilterGradSp(
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <class T>
-class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
+template <class T, bool fuse_relu_before_conv>
+class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
+                           fuse_relu_before_conv> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -446,6 +457,10 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
     T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int thread = 512;
+    if (output_width > 1024 && output_width <= 2048)
+      thread = (output_width - 1) / 2 + 1;
+    else if (output_width > 512 && output_width <= 1024)
+      thread = output_width;
     int blocks = std::min(std::max(thread / output_width, 1), output_height);
     dim3 threads(std::min(output_width, thread), blocks, 1);
     dim3 grid(output_channels, batch_size, 1);
@@ -456,8 +471,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
           stride_height == stride_width && stride_height == c_stride &&      \
           (ksize_height == ksize_width && ksize_height == c_filter ||        \
            c_filter == -1)) {                                                \
-    KernelDepthwiseConvSp<T, c_filter_multiplier, c_stride,                  \
-                          c_filter><<<grid, threads, 0, context.stream()>>>( \
+    KernelDepthwiseConvSp<                                                   \
+        T, c_filter_multiplier, c_stride, c_filter,                          \
+        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(      \
         input_data, filter_data, batch_size, output_channels, output_height, \
         output_width, input_channels, input_height, input_width,             \
         filter_multiplier, ksize_height, ksize_width, stride_height,         \
@@ -480,8 +496,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
-template <typename T>
-class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
+template <typename T, bool fuse_relu_before_conv>
+class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T,
+                                    fuse_relu_before_conv> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -507,11 +524,16 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
     const int dilate_height = dilations[0];
     const int dilate_width = dilations[1];
 
+    const T* input_data = input.data<T>();
     const T* filter_data = filter.data<T>();
     const T* output_grad_data = output_grad.data<T>();
     T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int thread = 512;
+    if (input_width > 1024 && input_width <= 2048)
+      thread = (input_width - 1) / 2 + 1;
+    else if (input_width > 512 && input_width <= 1024)
+      thread = input_width;
     int blocks = std::min(std::max(thread / input_width, 1), input_height);
     dim3 threads(std::min(input_width, thread), blocks, 1);
     dim3 grid(input_channels, batch_size, 1);
@@ -524,13 +546,13 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
           (ksize_height == ksize_width && ksize_height == c_filter ||   \
            c_filter == -1)) {                                           \
     KernelDepthwiseConvInputGradSp<                                     \
-        T, c_filter_multiplier, c_stride,                               \
-        c_filter><<<grid, threads, 0, context.stream()>>>(              \
-        output_grad_data, filter_data, batch_size, output_channels,     \
-        output_height, output_width, input_channels, input_height,      \
-        input_width, filter_multiplier, ksize_height, ksize_width,      \
-        stride_height, stride_width, padding_height, padding_width,     \
-        dilate_height, dilate_width, input_grad_data);                  \
+        T, c_filter_multiplier, c_stride, c_filter,                     \
+        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>( \
+        input_data, output_grad_data, filter_data, batch_size,          \
+        output_channels, output_height, output_width, input_channels,   \
+        input_height, input_width, filter_multiplier, ksize_height,     \
+        ksize_width, stride_height, stride_width, padding_height,       \
+        padding_width, dilate_height, dilate_width, input_grad_data);   \
     return;                                                             \
   }
     check_case(1, 1, 3);
@@ -552,8 +574,9 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
-template <typename T>
-class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
+template <typename T, bool fuse_relu_before_conv>
+class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T,
+                                     fuse_relu_before_conv> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
@@ -583,6 +606,10 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
     T* filter_grad_data = filter_grad->mutable_data<T>(context.GetPlace());
 
     int block_size = 512;
+    if (output_width > 1024 && output_width <= 2048)
+      block_size = (output_width - 1) / 2 + 1;
+    else if (output_width > 512 && output_width <= 1024)
+      block_size = output_width;
     int crop_output_height =
         std::min(std::max(block_size / output_width, 1), output_height);
     dim3 grid(ksize_width, ksize_height, output_channels);
@@ -592,7 +619,8 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
 #define check_case(c_filter_multiplier)                                       \
   if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \
     KernelDepthwiseConvFilterGradSp<                                          \
-        T, c_filter_multiplier><<<grid, threads, 0, context.stream()>>>(      \
+        T, c_filter_multiplier,                                               \
+        fuse_relu_before_conv><<<grid, threads, 0, context.stream()>>>(       \
         output_grad_data, input_data, batch_size, output_channels,            \
         output_height, output_width, input_channels, input_height,            \
         input_width, filter_multiplier, ksize_height, ksize_width,            \
@@ -606,18 +634,31 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
-template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, false>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, false>;
 
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
+                                             false>;
 template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             float>;
+                                             double, false>;
+
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              float, false>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                              double, false>;
+
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, float, true>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, double, true>;
+
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, float,
+                                             true>;
 template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                             double>;
+                                             double, true>;
 
 template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              float>;
+                                              float, true>;
 template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                              double>;
+                                              double, true>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
index 71f6fcb23df..56648e4125b 100644
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -26,7 +26,8 @@ namespace math {
  * \brief Compute the depthwise convolution which include
  * forward process and backpropagation process
  */
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T,
+          bool fuse_relu_before_conv = false>
 class DepthwiseConvFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
@@ -36,7 +37,8 @@ class DepthwiseConvFunctor {
                   const std::vector<int>& dilations, framework::Tensor* output);
 };
 
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T,
+          bool fuse_relu_before_conv = false>
 class DepthwiseConvInputGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
@@ -48,7 +50,8 @@ class DepthwiseConvInputGradFunctor {
                   framework::Tensor* input_grad);
 };
 
-template <typename DeviceContext, typename T>
+template <typename DeviceContext, typename T,
+          bool fuse_relu_before_conv = false>
 class DepthwiseConvFilterGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 96fa428ee36..96d0d16bf78 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1023,6 +1023,20 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                      to fuse elementwise_add_op and activation_op,
                      it may make the execution faster. Default False)DOC")
+      .def_property(
+          "fuse_relu_depthwise_conv",
+          [](const BuildStrategy &self) {
+            return self.fuse_relu_depthwise_conv_;
+          },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized.");
+            self.fuse_relu_depthwise_conv_ = b;
+          },
+          R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether
+                      to fuse relu and depthwise_conv2d,
+                      it will save GPU memory and may make the execution faster.
+                      This options is only available in GPU devices.
+                      Default False)DOC")
       .def_property(
           "memory_optimize",
           [](const BuildStrategy &self) { return self.memory_optimize_; },
diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py
index baa14a573fc..1f7ec69dd75 100644
--- a/python/paddle/fluid/contrib/memory_usage_calc.py
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -76,7 +76,7 @@ def memory_usage(program, batch_size):
 
     # Get the var_name list of first block and calculate
     total_memory = 0.0
-    processed_var_names = set()
+    processed_var_names = set(["@EMPTY@"])
     for op in program.global_block().ops:
         for var_name in op.output_arg_names:
             if var_name in processed_var_names:
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ea88d8b4d09..503c91c27ba 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1972,6 +1972,7 @@ def conv2d(input,
             'groups': groups,
             'use_cudnn': use_cudnn,
             'use_mkldnn': False,
+            'fuse_relu_before_depthwise_conv': False
         })
 
     pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 1ba47d5a576..fdacd241f9e 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -42,6 +42,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   use_reduce=False,
                                   use_ir_memory_optimize=False,
                                   fuse_elewise_add_act_ops=False,
+                                  fuse_relu_depthwise_conv=False,
                                   optimizer=fluid.optimizer.Adam,
                                   use_fast_executor=False,
                                   enable_sequential_execution=False):
@@ -60,7 +61,8 @@ class TestParallelExecutorBase(unittest.TestCase):
 
             loss = method(use_feed=feed_dict is not None)
 
-            optimizer().minimize(loss)
+            if optimizer:
+                optimizer().minimize(loss)
 
             if memory_opt:
                 fluid.memory_optimize(main)
@@ -76,6 +78,7 @@ class TestParallelExecutorBase(unittest.TestCase):
             build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                 if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
             build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
             build_strategy.memory_optimize = use_ir_memory_optimize
             build_strategy.enable_sequential_execution = enable_sequential_execution
             if use_cuda and core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 25a9e8d46ed..2927a9828fd 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -70,6 +70,7 @@ class TestConv2dOp(OpTest):
         self.exhaustive_search = False
         self.use_cuda = False
         self.use_mkldnn = False
+        self.fuse_relu_before_depthwise_conv = False
         self.data_format = "AnyLayout"
         self.dtype = np.float32
         self.init_kernel_type()
@@ -84,8 +85,17 @@ class TestConv2dOp(OpTest):
         }
 
         input = np.random.random(self.input_size).astype(self.dtype)
+        if not self.testcuda():
+            self.fuse_relu_before_depthwise_conv = False
+        if self.fuse_relu_before_depthwise_conv:
+            input = input - 0.5
+            input -= (input < 0) * 0.1
+            input += (input >= 0) * 0.1
+            input2 = np.maximum(input, 0.0)
+        else:
+            input2 = input
         filter = np.random.random(self.filter_size).astype(self.dtype)
-        output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
+        output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
                                                   conv2d_param)
         output = output.astype(self.dtype)
 
@@ -101,6 +111,8 @@ class TestConv2dOp(OpTest):
             'use_cudnn': self.use_cudnn,
             'use_mkldnn': self.use_mkldnn,
             'data_format': self.data_format,
+            'fuse_relu_before_depthwise_conv':
+            self.fuse_relu_before_depthwise_conv,
             'exhaustive_search': self.exhaustive_search
         }
         self.outputs = {'Output': output}
@@ -364,6 +376,78 @@ class TestDepthwiseConvWithDilation2(TestConv2dOp):
         self.op_type = "depthwise_conv2d"
 
 
+class TestDepthwiseConvandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [3, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConv3andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilationandFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
+class TestDepthwiseConvWithDilation2andFuse(TestConv2dOp):
+    def init_test_case(self):
+        self.fuse_relu_before_depthwise_conv = True
+        self.use_cuda = True
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        self.groups = 3
+        self.dilations = [2, 2]
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] // self.groups
+        self.filter_size = [6, f_c, 3, 3]
+        self.op_type = "depthwise_conv2d"
+
+
 class TestCUDNNExhaustiveSearch(TestConv2dOp):
     def init_kernel_type(self):
         self.use_cudnn = True
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
new file mode 100644
index 00000000000..0c8531606b8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def norm(*args, **kargs):
+    return fluid.layers.batch_norm(*args, **kargs)
+
+
+def sep_conv(input, channel, stride, filter, dilation=1, act=None):
+    # with scope('depthwise'):
+    input = fluid.layers.conv2d(
+        input,
+        input.shape[1],
+        filter,
+        stride,
+        groups=input.shape[1],
+        padding=(filter // 2) * dilation,
+        dilation=dilation,
+        use_cudnn=False,
+        bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    # with scope('pointwise'):
+    input = fluid.layers.conv2d(
+        input, channel, 1, 1, groups=1, padding=0, bias_attr=False)
+    input = norm(input)
+    if act: input = act(input)
+    return input
+
+
+def simple_depthwise_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = fluid.layers.reshape(img, (-1, 1, 28, 28))
+    for _ in range(4):
+        hidden = sep_conv(hidden, channel=200, stride=2, filter=5)
+        hidden = fluid.layers.relu(hidden)
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def _init_data(self, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+
+    def _compare(self, model, use_cuda, random_data=True, only_forward=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._init_data(random_data)
+
+        def _optimizer(learning_rate=1e-6):
+            optimizer = fluid.optimizer.SGD(
+                learning_rate=learning_rate,
+                regularization=fluid.regularizer.L2Decay(1e-6))
+            return optimizer
+
+        if only_forward:
+            _optimizer = None
+
+        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=True,
+            use_ir_memory_optimize=True,
+            memory_opt=False,
+            optimizer=_optimizer)
+        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            fuse_relu_depthwise_conv=False,
+            memory_opt=False,
+            optimizer=_optimizer)
+
+        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+
+    def test_simple_depthwise_with_fuse_op(self):
+        self._compare(simple_depthwise_net, True)
+        self._compare(simple_depthwise_net, False)
+
+    def test_simple_depthwise_with_fuse_op_only_forward(self):
+        self._compare(simple_depthwise_net, True, only_forward=True)
+        self._compare(simple_depthwise_net, False, only_forward=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab


From 5e5e6a32251ddd67defcf301262c85d6ec00efbb Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 21 Jan 2019 14:26:18 +0800
Subject: [PATCH 37/73] fix the prompt when dll load failed on windows
 test=develop

---
 python/paddle/fluid/framework.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 569ca2a4f72..77239f2e8ee 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -37,11 +37,13 @@ try:
     from . import core
 except ImportError as e:
     if os.name == 'nt':
+        executable_path = os.path.abspath(os.path.dirname(sys.executable))
         raise ImportError(
-            """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\"
-        if you encounters \"mkldnn.dll not found\" errors. If you have python
-        installed in other directory, replace \"c:\python27\lib" with your own
-        directory. The original error is: \n""" + cpt.get_exception_message(e))
+            """NOTE: You may need to run \"set PATH=%s;%%PATH%%\"
+        if you encounters \"DLL load failed\" errors. If you have python
+        installed in other directory, replace \"%s\" with your own
+        directory. The original error is: \n %s""" %
+            (executable_path, executable_path, cpt.get_exception_message(e)))
     else:
         raise ImportError(
             """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-- 
GitLab


From 885c4e57abdace2e769697b4b464edbfa62b19e6 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 21 Jan 2019 14:31:02 +0800
Subject: [PATCH 38/73] fea/infer memory optim2 (#14953)

---
 paddle/fluid/framework/ir/fc_fuse_pass.cc     |   1 +
 paddle/fluid/framework/ir/graph_helper.cc     | 143 +++-
 paddle/fluid/framework/ir/graph_helper.h      |  17 +
 .../framework/ir/graph_to_program_pass.cc     |  31 +-
 .../framework/ir/graph_to_program_pass.h      |   4 +
 paddle/fluid/framework/ir/graph_viz_pass.cc   |   2 +-
 paddle/fluid/framework/ir/node.h              |   2 +-
 paddle/fluid/framework/naive_executor.cc      |   7 +-
 .../fluid/inference/analysis/CMakeLists.txt   |   1 +
 paddle/fluid/inference/analysis/analyzer.cc   |  17 +-
 paddle/fluid/inference/analysis/analyzer.h    |   2 +-
 .../inference/analysis/analyzer_tester.cc     |   4 +
 paddle/fluid/inference/analysis/argument.h    |  11 +
 paddle/fluid/inference/analysis/helper.h      |   7 +
 .../inference/analysis/ir_pass_manager.cc     |   1 +
 .../analysis/ir_passes/CMakeLists.txt         |   2 +-
 .../analysis/ir_passes/subgraph_detector.cc   |   1 -
 .../ir_passes/tensorrt_subgraph_pass.cc       |   4 +
 .../inference/analysis/passes/CMakeLists.txt  |  13 +-
 .../passes/ir_analysis_compose_pass.cc        |  62 --
 .../analysis/passes/ir_analysis_pass.cc       |  14 +-
 .../analysis/passes/ir_analysis_pass.h        |   3 +
 .../passes/ir_graph_to_program_pass.cc        |  45 ++
 ...pose_pass.h => ir_graph_to_program_pass.h} |  20 +-
 .../analysis/passes/memory_optimize_pass.cc   | 647 ++++++++++++++++++
 .../analysis/passes/memory_optimize_pass.h    | 106 +++
 .../fluid/inference/analysis/passes/passes.cc |  13 +-
 paddle/fluid/inference/api/CMakeLists.txt     |  11 +-
 paddle/fluid/inference/api/analysis_config.cc | 102 ++-
 .../fluid/inference/api/analysis_predictor.cc |  92 ++-
 .../fluid/inference/api/analysis_predictor.h  |  10 +
 .../api/analysis_predictor_tester.cc          |  51 ++
 paddle/fluid/inference/api/demo_ci/run.sh     |   1 +
 paddle/fluid/inference/api/helper.h           |  15 +-
 .../inference/api/paddle_analysis_config.h    |  11 +
 .../inference/api/paddle_pass_builder.cc      |   5 +
 .../fluid/inference/api/paddle_pass_builder.h |  46 +-
 .../fluid/inference/tests/api/CMakeLists.txt  |   4 +-
 .../tests/api/analyzer_dam_tester.cc          |  30 +
 .../analyzer_text_classification_tester.cc    |   2 +
 .../tests/api/analyzer_vis_tester.cc          |   9 +-
 .../fluid/inference/tests/api/tester_helper.h |   6 +-
 .../inference/tests/api/trt_models_tester.cc  |   5 +
 paddle/fluid/inference/utils/benchmark.h      |   2 +-
 .../fluid/inference/utils/benchmark_tester.cc |   4 +-
 paddle/fluid/operators/controlflow/feed_op.cc |   1 +
 paddle/fluid/string/pretty_log.h              |  17 +
 47 files changed, 1450 insertions(+), 154 deletions(-)
 delete mode 100644 paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
 rename paddle/fluid/inference/analysis/passes/{ir_analysis_compose_pass.h => ir_graph_to_program_pass.h} (59%)
 create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
 create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.h

diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 26eac939054..12b31da010c 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index d99f856d8f4..8de93cf285e 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,8 +18,10 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <stack>
 #include <unordered_map>
 #include <unordered_set>
+#include "paddle/fluid/framework/ir/graph_traits.h"
 
 DEFINE_string(print_sub_graph_dir, "",
               "FLAGS_print_sub_graph_dir is used "
@@ -41,7 +43,7 @@ void SortHelper(
     }
   }
 
-  VLOG(3) << "topology sort insert: " << node->Name()
+  VLOG(5) << "topology sort insert: " << node->Name() << " "
           << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
   ret->push_back(node);
 }
@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
   return ret;
 }
 
+// Build operator inlink edge table.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph) {
   std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
 
   for (auto &n : graph.Nodes()) {
-    if (n->NodeType() != ir::Node::Type::kOperation) continue;
+    if (!n->IsOp()) continue;
     if (adj_list.find(n) == adj_list.end()) {
       adj_list[n] = std::unordered_set<ir::Node *>();
     }
@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
   return adj_list;
 }
 
+// Build operator outlink edge table.
+std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
+    const Graph &graph) {
+  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+
+  for (auto &n : graph.Nodes()) {
+    if (!n->IsOp()) continue;
+    if (adj_list.find(n) == adj_list.end()) {
+      adj_list[n] = std::unordered_set<ir::Node *>();
+    }
+    for (auto &var : n->outputs) {
+      for (auto &adj_n : var->outputs) {
+        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
+        VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
+        adj_list[n].insert(adj_n);
+      }
+    }
+  }
+  return adj_list;
+}
+
+std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
+  auto edge_table = BuildOperationOutAdjList(graph);
+  std::stack<Node *> stack;
+  for (auto &ele : edge_table) {
+    if (ele.first->inputs.empty()) {
+      // find the input ops (those without input vars)
+      stack.push(ele.first);
+    } else {
+      // find the ops with only persistable vars as inputs.
+      bool all_persistable = true;
+      for (auto *input : ele.first->inputs) {
+        if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
+          all_persistable = false;
+        }
+      }
+      if (all_persistable) {
+        stack.push(ele.first);
+      }
+    }
+  }
+
+  std::vector<Node *> res;
+  // start from the feed op and DFS
+  std::unordered_set<Node *> unique_set;
+  while (!stack.empty()) {
+    // will start from the last feed by default.
+    auto cur = stack.top();
+    stack.pop();
+    unique_set.insert(cur);
+    res.push_back(cur);
+
+    for (auto *op : edge_table[cur]) {
+      if (!unique_set.count(op)) {
+        stack.push(op);
+      }
+    }
+  }
+  return res;
+}
+
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
+  std::vector<ir::Node *> nodes;
+  std::unordered_map<Node *, int> in_degree;
+
+  auto set_out_ops_ready = [&](Node *var) {
+    for (auto *op : var->outputs) {
+      --in_degree[op];
+    }
+  };
+  // build in_degree
+  for (auto *node : graph.Nodes()) {
+    if (node->IsOp()) {
+      in_degree[node] += node->inputs.size();
+    } else if (node->IsVar() && node->inputs.empty()) {
+      // mark all the inputs of the whole graph as ready.
+      set_out_ops_ready(node);
+    }
+  }
+
+  std::deque<Node *> op_queue;
+  // first visit
+  for (auto &node : OpDFSSort(graph)) {
+    if (node->IsOp()) {
+      op_queue.push_back(node);
+    }
+  }
+
+  // traverse the graph
+  int num_ops = op_queue.size();
+  while (num_ops) {
+    for (auto it = op_queue.begin(); it != op_queue.end(); it++) {
+      auto *&cur_op = *it;
+      if (!cur_op || in_degree[cur_op] > 0) continue;
+      // visit this node
+      // mark all the output vars of this op as ready.
+      for (auto *out_var : cur_op->outputs) {
+        if (!out_var) continue;
+        set_out_ops_ready(out_var);
+      }
+      VLOG(8) << "visit " << cur_op->Name();
+      nodes.push_back(cur_op);
+
+      cur_op = nullptr;
+      num_ops--;
+    }
+  }
+
+  return nodes;
+}
+
 size_t GraphNum(const Graph &graph) {
   std::unordered_set<ir::Node *> nodes(graph.Nodes());
   std::unordered_set<ir::Node *> visited_nodes;
@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
   return graph_count;
 }
 
+void CleanIndividualNodes(Graph *graph) {
+  std::unordered_set<Node *> nodes2rm;
+  for (auto *node : graph->Nodes()) {
+    if (node->inputs.empty() && node->outputs.empty()) {
+      nodes2rm.insert(node);
+    }
+  }
+
+  for (auto *node : nodes2rm) {
+    graph->RemoveNode(node);
+  }
+}
+
+std::vector<Node *> TopologyVarientSort(const Graph &graph,
+                                        SortKind sort_kind) {
+  switch (sort_kind) {
+    case SortKind::TS:
+      return framework::ir::TopologySortOperations(graph);
+    default:
+      return framework::ir::TopologyDfsSortOperations(graph);
+  }
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index be525151f9f..fba4936f2c5 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
 
+// Topological sort that tries to visit the operators in DFS order.
+std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
+
+// Different ways to sort the operators of a graph into a sequence.
+enum class SortKind {
+  // Topological Sort
+  TS = 0,
+  // Topological and Depth First Search
+  TDFS
+};
+
+// Several kinds of topological sort.
+std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
+
+// Remove the nodes that are not connected to any other node.
+void CleanIndividualNodes(Graph *graph);
+
 // Build an adjacency list of operations for the `graph`.
 std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
     const Graph &graph);
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc
index 36f36933265..3372dcd181d 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.cc
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc
@@ -20,7 +20,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
-
 #include "paddle/fluid/framework/program_desc.h"
 
 namespace paddle {
@@ -29,6 +28,14 @@ namespace ir {
 
 std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
     std::unique_ptr<Graph> graph) const {
+  // Remove the unneeded variables after memory optimization.
+  std::unordered_set<std::string> vars2remove;
+  if (graph->Has(kGraphToProgramVarsToRemove)) {
+    vars2remove = graph->Get<std::unordered_set<std::string>>(
+        kGraphToProgramVarsToRemove);
+    VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
+  }
+
   ProgramDesc& program = Get<ProgramDesc>("program");
 
   std::unique_ptr<proto::ProgramDesc> program_pb(
@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
   std::unordered_set<std::string> visited_vars;
   for (ir::Node* n : graph->Nodes()) {
     if (n->IsVar()) {
-      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
+      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
+          !vars2remove.count(n->Var()->Name())) {
         visited_vars.insert(n->Var()->Name());
         block->add_vars()->MergeFrom(*n->Var()->Proto());
       }
     }
   }
-
   block->clear_ops();
-  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
+
+  std::vector<ir::Node*> nodes;
+  if (Has(kGraphToProgramSortKind)) {
+    // Inference memory optimization relies on this branch.
+    int sort_kind = Get<int>(kGraphToProgramSortKind);
+    nodes = TopologyVarientSort(
+        *graph, static_cast<framework::ir::SortKind>(sort_kind));
+  } else {
+    nodes = TopologySortOperations(*graph);
+  }
+
   for (ir::Node* n : nodes) {
-    if (!n->Op()) {
-      continue;
-    }
+    if (!n->Op()) continue;
+
     block->add_ops()->MergeFrom(*n->Op()->Proto());
   }
 
   program.CopyFrom(*program_pb);
   return graph;
 }
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h
index 124ec5a8e77..4c36c3a5da1 100644
--- a/paddle/fluid/framework/ir/graph_to_program_pass.h
+++ b/paddle/fluid/framework/ir/graph_to_program_pass.h
@@ -20,6 +20,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+const char kGraphToProgramVarsToRemove[] =
+    "__graph_to_program_vars_to_remove__";
+const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
+
 class GraphToProgramPass : public Pass {
  protected:
   std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 31ed98db72c..87a28a2a66c 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
 }  // namespace paddle
 
 REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
-    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
+    .RequirePassAttr(paddle::framework::ir::kGraphVizPath);
\ No newline at end of file
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index 89dcc677b57..9eade9eaa8f 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -64,7 +64,7 @@ class Node {
 
   std::string Name() const { return name_; }
 
-  VarDesc* Var() {
+  VarDesc* Var() const {
     PADDLE_ENFORCE(IsVar());
     return var_desc_.get();
   }
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 86e6b1f7d92..a37bb6f4da1 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
                              "running Paddle Inference";
 #endif  // PADDLE_ON_INFERENCE
   for (auto &op : ops_) {
-    VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
-            << " on scope " << scope_;
+    VLOG(4) << std::this_thread::get_id() << " run "
+            << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
     op->Run(*scope_, place_);
   }
@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
     anc = anc->parent();
   }
 
+  int num_vars = 0;
   for (auto &var : global_block.AllVars()) {
     if (var->Name() == framework::kEmptyVarName) {
       continue;
     }
+    num_vars++;
 
     if (persistable == var->Persistable()) {
       if (persistable) {
@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
       }
     }
   }
+  VLOG(4) << "naive executor create " << num_vars << " vars";
 }
 
 void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 27b6b80955e..7a795bda820 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -18,6 +18,7 @@ cc_library(analysis SRCS
   analyzer.cc
   analysis_pass
   DEPS ${analysis_deps} analysis_helper
+  ${INFER_IR_PASSES}
   )
 
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index c8ed373ee7c..d82a063d880 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -15,8 +15,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
 #include "paddle/fluid/inference/analysis/passes/passes.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace inference {
@@ -24,13 +24,16 @@ namespace analysis {
 
 Analyzer::Analyzer() {}
 
-void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
+void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
 
-void Analyzer::RunIrAnalysis(Argument *argument) {
-  std::vector<std::string> passes({"ir_analysis_compose_pass"});
-
-  for (auto &pass : passes) {
-    PassRegistry::Global().Retreive(pass)->Run(argument);
+void Analyzer::RunAnalysis(Argument *argument) {
+  PADDLE_ENFORCE(argument->analysis_passes_valid(),
+                 "analsis_passes is not valid in the argument.");
+  for (auto &pass : argument->analysis_passes()) {
+    string::PrettyLogH1("--- Running analysis [%s]", pass);
+    auto *ptr = PassRegistry::Global().Retreive(pass);
+    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
+    ptr->Run(argument);
   }
 }
 
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index b43e67f20f4..a6de18db600 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -54,7 +54,7 @@ class Analyzer final {
   DISABLE_COPY_AND_ASSIGN(Analyzer);
 
  protected:
-  void RunIrAnalysis(Argument* argument);
+  void RunAnalysis(Argument* argument);
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 4c84d02d867..c814ce45484 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
   argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
 
   Analyzer analyser;
   analyser.Run(&argument);
@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
   argument.SetUseGPU(false);
+  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
+                              "ir_params_sync_among_devices_pass"});
 
   Analyzer analyser;
   analyser.Run(&argument);
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2d8980b1d15..88ce61f9b92 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -110,16 +110,20 @@ struct Argument {
   // The overall Scope to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
 
+  // The default program, loaded from disk.
   DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
 
   // The ir passes to perform in analysis phase.
   DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                       std::vector<std::string>);
+  DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
+                      std::vector<std::string>);
 
   // Pass a set of op types to enable its mkldnn kernel
   DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                       std::unordered_set<std::string>);
 
+  // Passed from config.
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
@@ -127,6 +131,13 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
 
+  // Memory optimized related.
+  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
+  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
+  // Indicates which kind of sort algorithm is used for operators; the memory
+  // optimization relies on the sort algorithm.
+  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
+
   // The program transformed by IR analysis phase.
   DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
                              framework::proto::ProgramDesc);
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 269a0da9f93..de04713b531 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -28,6 +28,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/port.h"
 
+#ifdef _WIN32
+#define GCC_ATTRIBUTE(attr__) ;
+#else
+#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
+#endif
+#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
+
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index e37fea38bcb..4e146422645 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
   PADDLE_ENFORCE(graph.get());
   // Apply all the passes
   for (const auto &pass : passes_) {
+    if (pass->Type() == "graph_viz_pass") continue;
     PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
     graph = pass->Apply(std::move(graph));
   }
diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
index 9ae5b8aa173..eb6e1768a2c 100644
--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -1,6 +1,6 @@
 cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
 
-if (TENSORRT_FOUND)
+if (WITH_GPU AND TENSORRT_FOUND)
   cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
 
   set(analysis_deps ${analysis_deps}
diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
index b6a5dfd087c..a64f85ee9ac 100644
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
   auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
   for (auto &subgraph : subgraphs) {
     if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
-    LOG(INFO) << "detect a subgraph size " << subgraph.size();
     std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
     // replace this sub-graph with the first node. Two steps: 1. Create a Block
     // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index bc06e78ae69..5f25303cc1e 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace inference {
@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   framework::BlockDesc block_desc(nullptr, &block_proto);
   block_desc.Proto()->set_parent_idx(-1);
   block_desc.Proto()->set_idx(0);
+  string::PrettyLogDetail("---  detect a sub-graph with %d nodes",
+                          subgraph.size());
+
   for (auto *node : subgraph) {
     auto *op = block_desc.AppendOp();
     *op->Proto() = *node->Op()->Proto();
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index add9b70f2cd..691c336ebe4 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,11 +1,18 @@
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
 cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
+cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
+
+cc_library(analysis_passes SRCS passes.cc DEPS
+  ir_graph_build_pass
+  ir_analysis_pass
+  ir_params_sync_among_devices_pass
+  memory_optim_pass
+  ir_graph_to_program_pass
+)
 
 set(analysis_deps ${analysis_deps}
-        ir_graph_build_pass
-        ir_analysis_pass
         analysis_passes
         subgraph_detector
         CACHE INTERNAL "")
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
deleted file mode 100644
index 490189e5507..00000000000
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
-#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
-#include "paddle/fluid/string/pretty_log.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-void IrAnalysisComposePass::RunImpl(Argument *argument) {
-  ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
-  ApplyIrPasses(argument);
-  CollectFusionStatis(argument);
-}
-
-std::string IrAnalysisComposePass::repr() const {
-  return "ir-analysis-compose-pass";
-}
-
-void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
-  std::vector<std::string> passes({
-      "ir_graph_build_pass", "ir_analysis_pass",
-      "ir_params_sync_among_devices_pass",
-  });
-  for (const auto &pass : passes) {
-    VLOG(2) << "Run pass " << pass;
-    auto *the_pass = PassRegistry::Global().Retreive(pass);
-    the_pass->Run(argument);
-  }
-}
-
-void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
-  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
-    LOG(INFO) << "argument has no fuse statis";
-    return;
-  }
-  argument->SetFusionStatis(
-      argument->main_graph().Get<Argument::fusion_statis_t>(
-          framework::ir::kFuseStatisAttr));
-}
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
index e327bd39f0a..d986811a827 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 
 namespace paddle {
@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
   IRPassManager the_ir_manager(argument);
   graph = the_ir_manager.Apply(std::move(graph));
   PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
-  argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc(
-      the_ir_manager.AcquireProgram(&graph, argument->main_program())));
   argument->SetMainGraph(graph.release());
+  CollectFusionStatis(argument);
+}
+
+void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
+  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
+    LOG(INFO) << "argument has no fuse statis";
+    return;  // The main graph carries no fusion statistics to copy.
+  }
+  argument->SetFusionStatis(  // Copy the graph attribute into the argument.
+      argument->main_graph().Get<Argument::fusion_statis_t>(
+          framework::ir::kFuseStatisAttr));
 }
 
 std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
index d8a74498075..2c2113c06d9 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h
@@ -29,6 +29,9 @@ namespace analysis {
 class IrAnalysisPass : public AnalysisPass {
  public:
   void RunImpl(Argument* argument) override;
+
+  void CollectFusionStatis(Argument* argument);
+
   std::string repr() const override;
 };
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
new file mode 100644
index 00000000000..f1da37af3cc
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void IrGraphToProgramPass::RunImpl(Argument *argument) {
+  auto pass =
+      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
+  // Forward the operator sort kind to the IR pass only when it has been set.
+  if (argument->memory_optim_sort_kind_valid()) {
+    pass->Set(framework::ir::kGraphToProgramSortKind,
+              new int(argument->memory_optim_sort_kind()));
+  }
+  // Wrap the argument's graph temporarily so that it can be moved into Apply().
+  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
+  framework::ProgramDesc desc(argument->main_program());
+  pass->SetNotOwned("program", &desc);
+  auto thegraph = pass->Apply(std::move(graph));
+  thegraph.release();  // The argument still owns the graph.
+  // Store a copy of the program proto held by `desc` as the analyzed program.
+  argument->SetIrAnalyzedProgram(
+      new framework::proto::ProgramDesc(*desc.Proto()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
similarity index 59%
rename from paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
rename to paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
index 16c6b7d84df..838ebdbc9d7 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h
@@ -14,31 +14,17 @@
 
 #pragma once
 
-#include <string>
-#include <vector>
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
-#include "paddle/fluid/inference/analysis/passes/passes.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
 
-/*
- * The analysis pass to run a list of IR passes (like a function call).
- * Currently, it should be the first pass of analysis phase.
- */
-class IrAnalysisComposePass : public AnalysisPass {
+class IrGraphToProgramPass : public AnalysisPass {
  public:
-  void RunImpl(Argument* argument) override;
-  std::string repr() const override;
+  void RunImpl(Argument *argument) override;
 
- private:
-  void ApplyIrPasses(Argument* argument);
-
-  void CollectFusionStatis(Argument* argument);
-
-  // Assign a Scope for IR passes to modify the weights.
-  void AssignScopeToModify(Argument* argument);
+  std::string repr() const override { return "ir-graph-to-param-pass"; }
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
new file mode 100644
index 00000000000..57683c0b727
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc
@@ -0,0 +1,647 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::ir::Graph;
+using framework::ir::Node;
+using framework::ir::TopologyVarientSort;
+using space_table_t = MemoryOptimizePass::space_table_t;
+
+// Collect the lifecycle of every non-persistable tensor touched by an op.
+// Ops are visited in the order given by TopologyVarientSort(sort_kind); the
+// indices of the first and the last op touching a tensor bound its lifecycle,
+// so a different sort_kind may yield different lifecycles.
+void MemoryOptimizePass::CollectLifeCycle(
+    std::unordered_map<std::string, lifecycle_t>* lifecycles,
+    int sort_kind) const {
+  max_lifecycle_ = 0;
+  for (auto* op_node : framework::ir::TopologyVarientSort(
+           *graph_, static_cast<framework::ir::SortKind>(sort_kind))) {
+    if (!op_node->IsOp()) continue;
+    auto reads = op_node->inputs;
+    auto writes = op_node->outputs;
+
+    std::vector<Node*> requires(reads.begin(), reads.end());
+    requires.insert(requires.end(), writes.begin(), writes.end());
+
+    // Feed outputs get the widest possible lifecycle, which disables reuse.
+    if (op_node->Name() == "feed") {
+      for (auto* node : op_node->outputs) {
+        auto var = node->Name();
+        lifecycles->emplace(var,
+                            std::make_pair(0, std::numeric_limits<int>::max()));
+      }
+    } else {
+      // Other operators: start or extend the lifecycle of each tensor used.
+      for (const Node* node : requires) {
+        if (node->Var()->Persistable()) continue;
+        std::string var = node->Name();
+        if (!lifecycles->count(var)) {
+          (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_);
+        } else {
+          (*lifecycles)[var].second =
+              std::max(max_lifecycle_, lifecycles->at(var).second);  // extend
+        }
+      }
+    }
+
+    ++max_lifecycle_;
+  }
+}
+
+// Element size in bytes of `type`. TODO(Superjomn) Make this a general helper.
+int DataTypeToSpace(framework::proto::VarType_Type type) {
+  switch (type) {
+    case framework::proto::VarType_Type_BOOL:
+      return sizeof(bool);
+    case framework::proto::VarType_Type_FP32:
+      return sizeof(float);
+    case framework::proto::VarType_Type_INT32:
+      return sizeof(int32_t);
+    case framework::proto::VarType_Type_INT64:
+      return sizeof(int64_t);
+    default:  // Every other data type is rejected.
+      PADDLE_THROW("Unknown data type");
+  }
+}
+
+// Fill tensor_nodes and space_table for each non-persistable LoDTensor var.
+void MemoryOptimizePass::CollectVarMemorySize(
+    const std::unordered_map<std::string, size_t>& batch_var_ave_dim,
+    std::unordered_map<std::string, Node*>* tensor_nodes,
+    space_table_t* space_table) const {
+  // Collect tensors from graph.
+  for (auto* node : graph_->Nodes()) {
+    if (node->IsVar() &&
+        node->Var()->GetType() ==
+            framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
+      // Parameters will not be reused.
+      if (node->Var()->Persistable()) continue;
+      (*tensor_nodes)[node->Name()] = node;
+      (*space_table)[node->Name()] =  // element size * batch_var_ave_dim entry
+          DataTypeToSpace(node->Var()->GetDataType()) *
+          batch_var_ave_dim.at(node->Name());  // at() throws if name is absent
+    }
+  }
+}
+
+// Find a suitable free tensor to reuse: the one whose recorded size is closest
+// to the required size, to avoid memory waste.
+//
+// Args:
+// @tensor: the tensor that requires memory.
+// @space_required: the memory size the tensor requires.
+// @tensor_nodes: the tensor nodes in the ir::Graph (currently unused).
+// @free_existing_tensors: the allocated tensors that are currently free.
+// @space_table: the memory space of tensors.
+// @var_clusters: only a tensor in the same cluster as `tensor` is reusable.
+// @tensor2use: output, the tensor chosen for reuse.
+// Returns: true if a tensor to reuse is found; false if a new one is needed.
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    std::string* tensor2use) __SHOULD_USE_RESULT__;
+
+bool FindSuitableTensorToReuse(
+    const std::string& tensor, int space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    const space_table_t& space_table,
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    std::string* tensor2use) {
+  std::pair<std::string, size_t> best_fit;
+  best_fit.second = std::numeric_limits<int>::max();
+  VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";
+
+  // find the cluster this var belongs to.
+  const std::unordered_set<std::string>* cluster = nullptr;
+  for (const auto& c : var_clusters) {
+    if (c.count(tensor)) {
+      cluster = &c;
+      break;
+    }
+  }
+  PADDLE_ENFORCE_NOT_NULL(cluster,
+                          "something wrong in memory optimization, the "
+                          "variable %s not in the clusters.",
+                          tensor);
+
+  for (auto& candidate : *free_existing_tensors) {
+    // This is not a temporary tensor.
+    if (!space_table.count(candidate)) continue;
+    // Not in the same cluster.
+    if (!cluster->count(candidate)) continue;
+
+    size_t space = space_table.at(candidate);
+    // Both sizes are unsigned here, so compute the distance without wrapping.
+    const size_t need = static_cast<size_t>(space_required);
+    size_t space_diff = space > need ? space - need : need - space;
+    if (space_diff < best_fit.second) {
+      best_fit.first = candidate;
+      best_fit.second = space_diff;
+    }
+  }
+
+  if (best_fit.second < std::numeric_limits<int>::max()) {
+    *tensor2use = best_fit.first;
+    return true;
+  }
+  return false;
+}
+
+// Register `name` as a newly allocated tensor that uses its own memory.
+void AllocateNewTensor(
+    const std::string& name, size_t space_required,
+    const std::unordered_map<std::string, Node*>& tensor_nodes,
+    std::unordered_set<std::string>* free_existing_tensors,
+    space_table_t* space_table,
+    std::unordered_map<std::string, std::string>* reuse_table) {
+  // The newly born tensor is free to be used.
+  free_existing_tensors->insert(name);
+  // Its space must already be registered; grow it if more is required.
+  PADDLE_ENFORCE(space_table->count(name));
+  space_table->at(name) = std::max(space_table->at(name), space_required);
+  // The newly allocated tensor uses its own memory.
+  (*reuse_table)[name] = name;
+}
+
+// Free a tensor and make its underlying memory reusable.
+// @tensor: the tensor to free.
+// @free_existing_tensors: the free and allocated tensors.
+// @reuse_table: a map from a fake tensor to the existing allocated tensor.
+void FreeATensor(const std::string& tensor,
+                 std::unordered_set<std::string>* free_existing_tensors,
+                 std::unordered_map<std::string, std::string>* reuse_table) {
+  if (tensor == "feed" || tensor == "fetch") return;
+  // The really allocated tensor. NOTE(review): at() throws if it has no entry.
+  const auto& free_tensor = reuse_table->at(tensor);
+
+  free_existing_tensors->insert(free_tensor);
+}
+
+// Take `tensor2reuse` out of the free set and map `tensor` onto its memory.
+void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
+                  size_t memory_size,
+                  std::unordered_set<std::string>* free_existing_tensors,
+                  std::unordered_map<std::string, std::string>* reuse_table,
+                  space_table_t* reused_space_table) {
+  auto it = free_existing_tensors->find(tensor2reuse);
+  PADDLE_ENFORCE(it != free_existing_tensors->end());  // it must be free
+  free_existing_tensors->erase(it);
+  (*reuse_table)[tensor] = tensor2reuse;
+  // Update the memory size of the reused tensor; the memory will grow if the
+  // required memory is larger.
+  (*reused_space_table)[tensor2reuse] =
+      std::max(reused_space_table->at(tensor2reuse), memory_size);
+}
+
+// Sum the space of self-mapped tensors (allocated) and remapped ones (saved).
+void EvaluateMemoryUsage(
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    const space_table_t& space_table,
+    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
+    size_t* allocated, size_t* saved) {
+  *allocated = 0;
+  *saved = 0;
+  // An entry mapped to itself owns its memory; any other entry reuses memory.
+  for (auto elem : reuse_table) {
+    if (elem.first == elem.second) {
+      *allocated += space_table.at(elem.first);
+      VLOG(4) << elem.first << " <-> " << elem.second << " "
+              << space_table.at(elem.first) << " "
+              << space_table.at(elem.second);
+    } else {
+      *saved += space_table.at(elem.first);
+      VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
+    }
+  }
+  VLOG(4) << "allocated " << *allocated;
+  VLOG(4) << "saved " << *saved;
+}
+
+// Build `reuse_table` and record the resulting usage in `memory_allocation`.
+void MemoryOptimizePass::MakeReusePlan(
+    const std::vector<std::unordered_set<std::string>>& var_clusters,
+    const std::unordered_map<std::string, size_t>& var_batch_ave_size,
+    const space_table_t& space_table,
+    std::unordered_map<std::string, std::string>* reuse_table, int sort_kind,
+    MemoryAllocation* memory_allocation) const {
+  // Clear the existing plan.
+  reuse_table->clear();
+
+  // The `space_table` stores the real memory size for each tensor.
+  // The `reused_space_table` stores the maximum memory size required by a
+  // tensor during the memory reusing, the small tensor might be reused by a
+  // larger tensor, and the memory size of the small one will grow.
+  auto reused_space_table = space_table;
+
+  std::unordered_map<std::string, lifecycle_t> life_cycles;
+  std::unordered_map<std::string, Node*> tensor_nodes;
+  // The allocated tensors whose memory can be reused, they will live across the
+  // program execution.
+  std::unordered_set<std::string> existing_tensors;
+  // The existing tensor that has been allocated, and is also free to reuse.
+  std::unordered_set<std::string> free_existing_tensors;
+
+  CollectLifeCycle(&life_cycles, sort_kind);
+
+  for (int age = 0; age < max_lifecycle_; ++age) {
+    std::unordered_set<std::string> born_tensors;
+    std::unordered_set<std::string> dead_tensors;
+    // Gather the dead and born tensors.
+    for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end();
+         elem_it++) {
+      if (elem_it->second.first == -1) {
+        continue;
+      }
+      const auto& tensor = elem_it->first;
+      const auto& lifecycle = elem_it->second;
+      VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->"
+              << lifecycle.second;
+
+      // Collect newly born tensors.
+      if (lifecycle.first == age) {
+        born_tensors.insert(tensor);
+      }
+      // Collect dead tensors (lifecycle ended before this age) for reuse.
+      else if (lifecycle.second < age) {  // NOLINT
+        dead_tensors.insert(tensor);
+        // Mark the entry with -1 so that it is skipped at later ages.
+        elem_it->second.first = -1;  // avoid duplicate search
+      }
+    }
+
+    // Reuse the dead tensors for born tensors
+    for (const auto& tensor : born_tensors) {
+      // Tensors without a recorded size (not in space_table) are skipped.
+      std::string tensor2reuse;
+      if (!space_table.count(tensor)) continue;
+      size_t space_required = space_table.at(tensor);
+      if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes,
+                                    &free_existing_tensors, reused_space_table,
+                                    var_clusters, &tensor2reuse)) {
+        if (tensor != tensor2reuse) {
+          VLOG(4) << tensor << " -> " << tensor2reuse;
+        }
+        ReuseATensor(tensor, tensor2reuse, space_required,
+                     &free_existing_tensors, reuse_table, &reused_space_table);
+      } else {
+        VLOG(4) << "allocate " << tensor;
+        AllocateNewTensor(tensor, space_required, tensor_nodes,
+                          &free_existing_tensors, &reused_space_table,
+                          reuse_table);
+        ReuseATensor(tensor, tensor, space_required, &free_existing_tensors,
+                     reuse_table, &reused_space_table);
+      }
+    }
+
+    for (const auto& tensor : dead_tensors) {
+      // Return the memory behind the dead tensor to the free set.
+      FreeATensor(tensor, &free_existing_tensors, reuse_table);
+    }
+  }
+
+  EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size,
+                      &(memory_allocation->allocated),
+                      &(memory_allocation->saved));
+  memory_allocation->sort_kind = sort_kind;
+}
+
+void BuildVarNodeTable(Graph* graph,
+                       std::unordered_map<std::string, Node*>* var_node_table) {
+  // Index the variable nodes of `graph` by name; if two nodes share a name,
+  // the one visited later overwrites the earlier entry.
+  for (auto* candidate : graph->Nodes()) {
+    if (candidate->IsVar()) (*var_node_table)[candidate->Name()] = candidate;
+  }
+}
+
+// NOTE The optimized opdesc doesn't match ir::Graph.
+void UpdateOpDescsByReuse(
+    Graph* graph,
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind) {
+  // Maps a variable name to the tensor it shares memory with; names missing
+  // from reuse_table are returned unchanged. One lookup per name.
+  auto reused_name = [&reuse_table](const std::string& name) {
+    auto it = reuse_table.find(name);
+    return it == reuse_table.end() ? name : it->second;
+  };
+  // TODO(Superjomn) change here to be compatible with the runtime order.
+  for (auto* node : TopologyVarientSort(
+           *graph, static_cast<framework::ir::SortKind>(sort_kind))) {
+    if (!node->IsOp()) continue;
+    // Build the renamed argument lists first and write them back afterwards;
+    // presumably so the op's argument maps are not modified while iterated.
+    std::unordered_map<std::string, std::vector<std::string>> in_args,
+        out_args;
+    // Bind by const reference: copying each (slot, names) pair is wasteful.
+    for (const auto& argument : node->Op()->Inputs()) {
+      for (const auto& x : argument.second) {
+        const std::string name = reused_name(x);
+        in_args[argument.first].push_back(name);
+        VLOG(4) << node->Name() << " input " << x << " -> " << name;
+      }
+    }
+    for (const auto& argument : node->Op()->Outputs()) {
+      for (const auto& x : argument.second) {
+        const std::string name = reused_name(x);
+        out_args[argument.first].push_back(name);
+        VLOG(4) << node->Name() << " output " << x << " -> " << name;
+      }
+    }
+
+    // Update arguments.
+    for (auto& arg : in_args) {
+      node->Op()->SetInput(arg.first, arg.second);
+    }
+    for (auto& arg : out_args) {
+      node->Op()->SetOutput(arg.first, arg.second);
+    }
+    node->Op()->Flush();
+  }
+}
+
+void MemoryOptimizePass::PerformReusePlan(
+    const std::unordered_map<std::string, std::string>& reuse_table,
+    int sort_kind, std::unordered_set<std::string>* vars2remove) const {
+  // Rewrite the op descs of graph_ so that they reference the reused tensors.
+  UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);
+
+  // Collect every variable that was remapped onto a different tensor.
+  for (auto& item : reuse_table) {
+    if (item.first != item.second) {
+      vars2remove->insert(item.first);
+    }
+  }
+  // vars2remove is an output parameter; its final size is logged for tracing.
+  VLOG(2) << "to remove vars " << vars2remove->size();
+}
+
+std::vector<std::string> split(const std::string& line, char delim) {
+  // Tokenize `line` on `delim`; a trailing delimiter yields no empty field.
+  std::vector<std::string> pieces;
+  std::istringstream reader(line);
+  for (std::string token; std::getline(reader, token, delim);) {
+    pieces.push_back(token);
+  }
+  return pieces;
+}
+
+// Deserialize the batch var shapes from the cache file.
+std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
+    const std::string& path) {
+  std::ifstream file(path);
+  PADDLE_ENFORCE(file.is_open(), "failed to open %s  to read cache", path);
+  std::vector<std::map<std::string, std::vector<int>>> batch_shapes;
+
+  // One line per batch; each line is a list of `name:d0,d1,...;` records.
+  for (std::string record_line; std::getline(file, record_line);) {
+    std::map<std::string, std::vector<int>> shapes_of_batch;
+    for (const auto& record : split(record_line, ';')) {
+      const auto name_and_dims = split(record, ':');
+      PADDLE_ENFORCE_EQ(name_and_dims.size(), 2UL);
+      std::vector<int> dims;
+      for (const auto& dim_text : split(name_and_dims[1], ',')) {
+        dims.push_back(std::stoi(dim_text));
+      }
+      shapes_of_batch[name_and_dims.front()] = dims;
+    }
+    batch_shapes.push_back(shapes_of_batch);
+  }
+  return batch_shapes;
+}
+
+// Calculate the average dim of each tensor from the batch shape cache.
+std::unordered_map<std::string, size_t> GetBatchAverageSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  std::unordered_map<std::string, size_t> var2size;
+  // Sum the element count of every variable over all recorded batches. The
+  // product is folded in size_t: an int accumulator overflows (undefined
+  // behavior) once a tensor holds more than INT_MAX elements.
+  for (const auto& batch : batches) {
+    for (const auto& item : batch) {
+      var2size[item.first] += std::accumulate(
+          item.second.begin(), item.second.end(), static_cast<size_t>(1),
+          [](size_t a, int b) { return a * static_cast<size_t>(b); });
+    }
+  }
+
+  // Divide by the total number of batches; a variable absent from some
+  // batches is still averaged over all of them. The loop body never runs
+  // when `batches` is empty, so there is no division by zero.
+  for (auto& item : var2size) item.second /= batches.size();
+  return var2size;
+}
+
+// Analyze the batch shapes loaded from the cache file.
+// By splitting the variables into clusters according to their batch sizes,
+// we can pre-schedule the size changes of different LoDTensors when input
+// sequences of different lengths are fed.
+// This should work fine for models operating on sentences.
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
+    const std::vector<std::map<std::string, std::vector<int>>>& batches) {
+  // For each variable, build a signature made of its leading dimension in
+  // every batch. Entries are ','-terminated so that e.g. {1, 12} and {11, 2}
+  // do not both collapse to "112".
+  std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
+  for (auto& batch : batches) {
+    for (auto& ele : batch) {
+      // front() on an empty shape is undefined behavior; record 0 instead.
+      int batch_size = ele.second.empty() ? 0 : ele.second.front();
+      // TODO(Superjomn) might consume large memory here, use combine hash.
+      var_batchsize_hashes[ele.first] << batch_size << ',';
+    }
+  }
+
+  // Group the variables by signature. Keying on the signature text itself
+  // (rather than on its std::hash) keeps hash collisions from merging
+  // unrelated clusters.
+  std::unordered_map<std::string, std::unordered_set<std::string>> shape_sets;
+  for (auto& ele : var_batchsize_hashes) {
+    shape_sets[ele.second.str()].insert(ele.first);
+  }
+  std::vector<std::unordered_set<std::string>> res;
+  for (auto& ele : shape_sets) res.emplace_back(std::move(ele.second));
+
+  VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters";
+  return res;
+}
+
+// Analyze the batch shapes loaded from the cache file, and split the
+// variables into different clusters by their size.
+// This should work fine for models in general.
+std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
+    const space_table_t& space_table,
+    const std::vector<std::map<std::string, std::vector<int>>>& batches,
+    int interval = 200000) {
+  // Variables whose size falls into the same bucket
+  // [k * interval, (k + 1) * interval) form one cluster. `batches` is unused
+  // and only kept for the callers' sake.
+  PADDLE_ENFORCE_GT(interval, 0);
+  size_t max_size = 0;
+  for (auto& item : space_table) {
+    max_size = std::max(item.second, max_size);
+  }
+  VLOG(4) << "tensor max size " << max_size;
+
+  // Bucket each variable directly by size / interval. This is a single pass,
+  // whereas rescanning the whole table once per interval costs
+  // O(max_size / interval * n). std::map keeps the buckets in ascending size
+  // order and only ever holds the non-empty ones.
+  const size_t bucket_width = static_cast<size_t>(interval);
+  std::map<size_t, std::unordered_set<std::string>> buckets;
+  for (auto& item : space_table) {
+    buckets[item.second / bucket_width].insert(item.first);
+  }
+
+  std::vector<std::unordered_set<std::string>> res;
+  for (auto& bucket : buckets) {
+    res.push_back(std::move(bucket.second));
+  }
+
+  VLOG(3) << "Cluster by interval and get " << res.size() << " cluster";
+  return res;
+}
+
+std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
+
+void MemoryOptimizePass::RunImpl(Argument* argument) {
+  // Pick and apply the best reuse plan; skip if disabled or force-updating.
+  if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
+    return;
+  graph_ = argument->main_graph_ptr();
+
+  auto path = GetMemoryCachePath(
+      argument->model_dir_valid() ? argument->model_dir() : "",
+      argument->model_program_path_valid() ? argument->model_program_path()
+                                           : "");
+  VLOG(3) << "Load memory cache from " << path;
+  if (inference::IsFileExists(path)) {
+    VLOG(4) << "Performing memory optimize";
+    auto batches = DeseralizeBatchVarShapes(path);
+    auto var_batch_ave_size = GetBatchAverageSize(batches);
+
+    std::unordered_map<std::string, Node*> tensor_nodes;
+    space_table_t space_table;
+    CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
+
+    std::unordered_map<std::string, std::string> reuse_table;
+    double max_saving_ratio = 0.;
+
+    std::vector<std::function<MemoryAllocation()>> strategies;
+    // Four strategies per sort kind, differing in how variables are clustered.
+    for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_batch_size =
+            AnalysisBatchShapesByBatchSize(batches);
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024);  // interval 1kb
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches, 1024 * 1024);  // interval 1MB
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+
+      strategies.emplace_back([&, sort_kind] {
+        auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
+            space_table, batches,
+            std::numeric_limits<int>::max());  // no intervals
+        MemoryAllocation allocation;
+        MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
+                      space_table, &reuse_table, sort_kind, &allocation);
+        return allocation;
+      });
+    }
+
+    std::function<MemoryAllocation()>* best_strategy{nullptr};
+    // Run every strategy; keep the one with the highest saving ratio. A ratio
+    // must be strictly positive to win, otherwise best_strategy stays null.
+    for (auto& strategy : strategies) {
+      auto allocation = strategy();
+      string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
+                              allocation.GetSavingRatio());
+      if (allocation.GetSavingRatio() > max_saving_ratio) {
+        max_saving_ratio = allocation.GetSavingRatio();
+        best_strategy = &strategy;
+      }
+    }
+    if (!best_strategy) {
+      LOG(ERROR)
+          << "This model makes poor memory optimize, skip memory optimize";
+      return;
+    }
+    auto memory_allocation = (*best_strategy)();
+    // NOTE(review): re-run above presumably restores the winner's reuse_table.
+    string::PrettyLogH2(
+        "--- Saved %.2f%s memory for workspace(temporary variables)",
+        memory_allocation.GetSavingRatio() * 100, "%");
+    string::PrettyLogDetail("--- Allocated %d MB",
+                            memory_allocation.allocated / 1024. / 1024.);
+    string::PrettyLogDetail("--- Saved %d MB",
+                            memory_allocation.saved / 1024. / 1024.);
+    argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
+                               new std::unordered_set<std::string>);
+    auto& vars2remove =
+        argument->main_graph().Get<std::unordered_set<std::string>>(
+            framework::ir::kGraphToProgramVarsToRemove);
+    // Apply the plan and record the sort kind it was built with.
+    PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
+    argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
+  }
+}
+// Returns saved / (allocated + saved), or 0 when nothing was saved (no NaN).
+float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
+  return saved ? (saved / 1024.) / (allocated / 1024. + saved / 1024.) : 0.f;
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
new file mode 100644
index 00000000000..fa1ad9c8c6a
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Memory optimization pass for inference with pre-analysis of memory usage
+ * without GC.
+ * Unlike training, the inference memory reuse strategy doesn't include GC,
+ * because its overhead is too high when the batch size equals one.
+ *
+ * The inference memory reuse tries to pre-determine the tensor reusing strategy
+ * without runtime overhead.
+ *
+ * To improve the strategy's performance, a warm-up running is introduced:
+ *   - Before officially deploy the inference program, one should warm it up and
+ *     generate some runtime cache,
+ *   - Run the inference program with several batches of data, it will persist
+ *     some runtime information about memory of tensors to disk, we call the
+ *     information the memory reusing cache,
+ *   - With the memory reusing cache, user can deploy the inference to a
+ *     service, before running the model, the inference program will load the
+ *     memory cache, analysis it and generate the best memory reusing strategy,
+ *     and adjust the execution of the network.
+ *
+ * With the warm-up and memory reusing cache design, the memory reusing
+ * algorithm can analyze the real memory consumption of the tensors, even with
+ * the flexible LoDTensor and special shape-changing operators such as
+ * sequence-pooling.
+ */
+class MemoryOptimizePass : public AnalysisPass {
+ public:
+  using space_table_t = std::unordered_map<std::string, size_t>;  // name->size
+  using lifecycle_t = std::pair<int, int>;  // (birth age, last age in use)
+
+  struct MemoryAllocation {
+    size_t allocated;  // allocated memory in byte.
+    size_t saved;      // saved memory in byte.
+    int sort_kind;     // the kind of the corresponding sorting algorithm.
+
+    // Ratio saved / (allocated + saved) for the temporary variables.
+    float GetSavingRatio() const;
+  };
+
+  virtual ~MemoryOptimizePass() = default;
+
+ protected:
+  void RunImpl(Argument *argument) override;
+
+ private:
+  void CollectLifeCycle(
+      std::unordered_map<std::string, lifecycle_t> *lifecycles,
+      int sort_kind) const;
+
+  void CollectVarMemorySize(
+      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
+      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
+      space_table_t *space_table) const;
+
+  // Fills reuse_table and memory_allocation; the function itself returns void.
+  void MakeReusePlan(
+      const std::vector<std::unordered_set<std::string>> &var_clusters,
+      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
+      const space_table_t &space_table,
+      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
+      MemoryAllocation *memory_allocation) const;
+  // Applies reuse_table to the op descs and collects the replaced variables.
+  void PerformReusePlan(
+      const std::unordered_map<std::string, std::string> &reuse_table,
+      int sort_kind, std::unordered_set<std::string> *vars2remove) const;
+
+ public:
+  std::string repr() const override;
+
+ private:
+  mutable framework::ir::Graph *graph_{nullptr};  // assigned in RunImpl()
+  mutable int max_lifecycle_{-1};  // age bound iterated by MakeReusePlan
+};
+
+static std::string GetMemoryCachePath(const std::string &model_path,
+                                      const std::string &prog_path) {
+  // Prefer the model directory; fall back to the program file path.
+  const std::string &base = model_path.empty() ? prog_path : model_path;
+  return base + ".memory_cache";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index 9245e32cee2..161b127d6d5 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -13,24 +13,31 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/passes/passes.h"
-#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 
 namespace paddle {
 namespace inference {
 namespace analysis {
+
 PassRegistry::PassRegistry() {
+  // Register manually to avoid the trivial `USE_OP` like macro for easier use
+  // and link.
   passes_.emplace("ir_analysis_pass",
                   std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
   passes_.emplace("ir_graph_build_pass",
                   std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
-  passes_.emplace("ir_analysis_compose_pass",
-                  std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+  passes_.emplace("memory_optimize_pass",
+                  std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
   passes_.emplace(
       "ir_params_sync_among_devices_pass",
       std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
+  passes_.emplace(
+      "ir_graph_to_program_pass",
+      std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 8b3838f69a8..ad0af4005ad 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,8 +18,10 @@ if(APPLE)
 endif(APPLE)
 
 
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
-    ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
+set(inference_deps ${analysis_deps}
+  paddle_inference_api paddle_fluid_api
+  analysis pass naive_executor
+  ${GLOB_PASS_LIB})
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
@@ -29,7 +31,8 @@ add_subdirectory(details)
 
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
+  reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
            lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
            analysis_config paddle_pass_builder zero_copy_tensor
@@ -44,7 +47,7 @@ if(WITH_TESTING)
                       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
   set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
 endif()
-cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
+cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
         ARGS --dirname=${WORD2VEC_MODEL_DIR})
 
 if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 965bbd0fd26..f9da3004ed8 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
 
 contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
   model_dir_ = model_dir;
+
+  Update();
 }
 contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                         const std::string &params_file) {
   prog_file_ = prog_file;
   params_file_ = params_file;
+
+  Update();
 }
 void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
                                        const std::string &params_file_path) {
   prog_file_ = prog_file_path;
   params_file_ = params_file_path;
+
+  Update();
 }
 void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                            int device_id) {
@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
   memory_pool_init_size_mb_ = memory_pool_init_size_mb;
   device_id_ = device_id;
 #else
-  LOG(ERROR) << "Please compile with gpu to EnableGpu";
+  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
   use_gpu_ = false;
 #endif
+
+  Update();
+}
+void contrib::AnalysisConfig::DisableGpu() {
+  use_gpu_ = false;
+
+  Update();
 }
-void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
 
 contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   CP_MEMBER(use_gpu_);
   CP_MEMBER(device_id_);
   CP_MEMBER(memory_pool_init_size_mb_);
+
+  CP_MEMBER(enable_memory_optim_);
+  CP_MEMBER(memory_optim_force_update_);
   // TensorRT releated.
   CP_MEMBER(use_tensorrt_);
   CP_MEMBER(tensorrt_workspace_size_);
@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   }
 
 #undef CP_MEMBER
+
+  Update();
 }
 
 void contrib::AnalysisConfig::EnableMKLDNN() {
@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
   use_mkldnn_ = false;
 #endif
+
+  Update();
 }
 
 void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
                                                    int max_batch_size,
                                                    int min_subgraph_size) {
+#ifdef PADDLE_WITH_CUDA
+  if (!use_gpu()) {
+    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    return;
+  }
+
   use_tensorrt_ = true;
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
+
   Update();
+#else
+  LOG(ERROR)
+      << "To use TensorRT engine, please compile inference lib with GPU first.";
+#endif
 }
 
+// TODO(Superjomn) refactor this, buggy.
 void contrib::AnalysisConfig::Update() {
   auto info = SerializeInfoCache();
   if (info == serialized_info_cache_) return;
 
-  if (use_gpu_) {
-    pass_builder_.reset(new GpuPassStrategy);
+  // Transfer pass_builder and copy the existing compatible passes.
+  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
+    if (use_gpu()) {
+      pass_builder_.reset(new GpuPassStrategy);
+
+      if (use_tensorrt_) {
+        // Append after the Affine_channel_conv_fuse pass.
+        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
+      }
+    } else {
+      pass_builder_.reset(new CpuPassStrategy);
+    }
+
   } else {
-    pass_builder_.reset(new CpuPassStrategy);
+    if (use_gpu()) {
+      pass_builder_.reset(new GpuPassStrategy(
+          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
+
+    } else {
+      pass_builder_.reset(new CpuPassStrategy(
+          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
+    }
   }
 
   if (use_tensorrt_) {
-    if (!use_gpu_) {
-      LOG(ERROR)
-          << "TensorRT engine is not available when EnableGpu() not actived.";
-    } else {
+    const auto &passes = pass_builder_->AllPasses();
+    if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
+        std::end(passes)) {
       // Append after the Affine_channel_conv_fuse pass.
       pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
 #endif
   }
 
+  if (enable_memory_optim_) {
+    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
 
 std::string contrib::AnalysisConfig::SerializeInfoCache() {
   std::stringstream ss;
+  ss << model_dir_;
+  ss << prog_file_;
+  ss << params_file_;
+
   ss << use_gpu_;
+  ss << device_id_;
   ss << memory_pool_init_size_mb_;
 
   ss << use_tensorrt_;
   ss << tensorrt_workspace_size_;
   ss << tensorrt_max_batchsize_;
+  ss << tensorrt_min_subgraph_size_;
+
+  ss << enable_memory_optim_;
+  ss << memory_optim_force_update_;
 
   ss << use_mkldnn_;
+  for (auto &item : mkldnn_enabled_op_types_) ss << item;
+  ss << ";";
+
+  ss << model_from_memory_;
+
   ss << enable_ir_optim_;
   ss << use_feed_fetch_ops_;
   ss << ir_debug_;
 
+  ss << specify_input_name_;
+  ss << cpu_math_library_num_threads_;
+
   return ss.str();
 }
 
 void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
     int cpu_math_library_num_threads) {
   cpu_math_library_num_threads_ = cpu_math_library_num_threads;
+
+  Update();
 }
 
 float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
 #endif
 }
 
+void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
+  enable_memory_optim_ = true;
+  memory_optim_force_update_ = force_update_cache;
+  // Update() appends memory_optimize_pass when enable_memory_optim_ is set.
+  Update();
+}
+
+bool contrib::AnalysisConfig::enable_memory_optim() const {
+  return enable_memory_optim_;  // turned on by EnableMemoryOptim()
+}
+
 void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                              size_t prog_buffer_size,
                                              const char *param_buffer,
@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
   prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
   params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
   model_from_memory_ = true;
+
+  Update();
 }
 
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 3917b9b65b5..2b0cad5faa0 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -24,18 +24,21 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#if PADDLE_WITH_TENSORRT
-#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
-#endif
 #include "paddle/fluid/inference/utils/singleton.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#if PADDLE_WITH_TENSORRT
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#endif
+
 DECLARE_bool(profile);
 
 namespace paddle {
@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     LOG(ERROR) << "fail to get fetches";
     return false;
   }
+
+  // Collect variable shapes for memory optimization.
+  if (need_collect_var_shapes_for_memory_optim()) {
+    CollectVarShapes();
+  }
+
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
   // All the containers in the scope will be hold in inference, but the
@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 
   argument_.SetUseGPU(config_.use_gpu());
   argument_.SetGPUDeviceId(config_.gpu_device_id());
+  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
+  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir().empty()) {
@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
 
   if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
+    LOG(INFO) << "TensorRT subgraph engine is enabled";
     argument_.SetUseTensorRT(true);
     argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }
 
   if (config_.use_mkldnn_) {
+    LOG(INFO) << "MKLDNN is enabled";
     argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
   }
 
   auto passes = config_.pass_builder()->AllPasses();
-  if (!config_.ir_optim()) passes.clear();
+  if (!config_.ir_optim()) {
+    passes.clear();
+    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
+  }
   argument_.SetIrAnalysisPasses(passes);
+  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
   argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
   Analyzer().Run(&argument_);
 
@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
   if (sub_scope_) {
     scope_->DeleteScope(sub_scope_);
   }
+
+  // TODO(Superjomn) deduce the directory path.
+  std::string out_path = inference::analysis::GetMemoryCachePath(
+      config_.model_dir(), config_.prog_file());
+  if (need_collect_var_shapes_for_memory_optim()) {
+    SerializeBatchVarShapes(out_path);
+  }
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
   return std::unique_ptr<PaddlePredictor>(x);
 }
 
+void AnalysisPredictor::CollectVarShapes() {
+  VLOG(4) << "Collecting var shapes";
+  if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
+  std::map<std::string, std::vector<int>> var_shapes;
+  for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
+    auto *var = sub_scope_->FindVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
+        var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
+      auto &tensor = var->Get<framework::LoDTensor>();
+      auto shape = framework::vectorize(tensor.dims());
+      var_shapes[var_name].assign(shape.begin(), shape.end());
+    }
+  }
+  batch_var_shapes_.push_back(var_shapes);
+  LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
+                       << " batch of var shapes for analysis";
+}
+
+void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
+  LOG(INFO) << "serialize batch var shapes to " << path;
+  std::ofstream file(path);
+  if (!file.is_open()) {
+    LOG(ERROR) << "failed to serialize the var shapes to " << path;
+    return;
+  }
+
+  // The serialized data format (one line per batch):
+  // <tensor_name>:dim0,dim1,dim2;
+  for (auto &batch : batch_var_shapes_) {
+    for (auto &ele : batch) {
+      file << ele.first << ":";
+      for (size_t i = 0; i < ele.second.size() - 1; i++) {
+        file << ele.second[i] << ",";
+      }
+      file << ele.second.back() << ";";
+    }
+    file << "\n";
+  }
+}
+
+bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
+  if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
+  bool need = false;
+  // check if the cache exists
+  if (!config_.enable_memory_optim()) {
+    need = false;
+  } else if (config_.enable_memory_optim() &&
+             !inference::IsFileExists(inference::analysis::GetMemoryCachePath(
+                 config_.model_dir(), config_.prog_file()))) {
+    need = true;
+  } else if (config_.enable_memory_optim() &&
+             config_.memory_optim_force_update_) {
+    need = true;
+  }
+
+  need_collect_var_shapes_ = need ? 1 : 0;
+  return need;
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
     const contrib::AnalysisConfig &config) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 6ca4b5e9bed..e25b5a7047b 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
   void SetMkldnnThreadID(int tid);
 
  protected:
+  // For memory optimization.
+  bool need_collect_var_shapes_for_memory_optim();
+  void CollectVarShapes();
+  void SerializeBatchVarShapes(const std::string &path);
+
   bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
   bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
   bool CreateExecutor();
@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
   // A mutex help to make Clone thread safe.
   std::mutex clone_mutex_;
 
+  // For memory optimization.
+  const size_t max_shape_collect_count_{1000};
+  int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
+  std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
+
  private:
   // Some status here that help to determine the status inside the predictor.
   bool status_program_optimized_{false};
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 3df26cde3d5..4688e93d710 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -16,8 +16,10 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 DEFINE_string(dirname, "", "dirname to tests.");
 
@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
   }
 }
 
+TEST(AnalysisPredictor, memory_optim) {
+  AnalysisConfig config(FLAGS_dirname);
+  config.DisableGpu();
+  config.EnableMemoryOptim(true);
+  config.pass_builder()->TurnOnDebug();
+
+  auto native_predictor =
+      CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
+
+  // 2. Dummy Input Data
+  int64_t data[4] = {1, 2, 3, 4};
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({4, 1});
+  tensor.data.Reset(data, sizeof(data));
+  tensor.dtype = PaddleDType::INT64;
+
+  std::vector<PaddleTensor> inputs(4, tensor);
+  std::vector<PaddleTensor> output, output1;
+
+  {
+    // The first predictor helps to cache the memory optimization strategy.
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+
+    // Run several times to check the parameters are not reused by mistake.
+    for (int i = 0; i < 5; i++) {
+      ASSERT_TRUE(predictor->Run(inputs, &output));
+    }
+  }
+
+  {
+    output.clear();
+    // The second predictor to perform memory optimization.
+    config.EnableMemoryOptim(false);
+    auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+
+    // Run with memory optimization
+    ASSERT_TRUE(predictor->Run(inputs, &output));
+  }
+
+  // Run native
+  ASSERT_TRUE(native_predictor->Run(inputs, &output1));
+
+  LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
+  LOG(INFO) << "the native output "
+            << inference::DescribeTensor(output1.front());
+
+  inference::CompareResult(output, output1);
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 9811fe2cd06..963986f245c 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 set -x
 PADDLE_ROOT=$1
 TURN_ON_MKL=$2 # use MKL or Openblas
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index cdd01cb9f06..b92781e4f2c 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -15,7 +15,10 @@
 #pragma once
 
 #include <glog/logging.h>
-
+#include <fstream>
+#if !defined(_WIN32)
+#include <sys/time.h>
+#endif
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <iterator>
@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
   return true;
 }
 
-static std::string DescribeTensor(const PaddleTensor &tensor) {
+static std::string DescribeTensor(const PaddleTensor &tensor,
+                                  int max_num_of_data = 15) {
   std::stringstream os;
   os << "Tensor [" << tensor.name << "]\n";
   os << " - type: ";
@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
   }
 }
 
+static bool IsFileExists(const std::string &path) {
+  std::ifstream file(path);
+  bool exists = file.is_open();
+  file.close();
+  return exists;
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index ae6ac69854d..1cee8904500 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -192,6 +192,13 @@ struct AnalysisConfig {
    */
   bool model_from_memory() const { return model_from_memory_; }
 
+  /** Turn on memory optimization.
+   * NOTE: still in development, will be released later.
+   */
+  void EnableMemoryOptim(bool force_update_cache = false);
+  /** Tell whether the memory optimization is activated. */
+  bool enable_memory_optim() const;
+
   friend class ::paddle::AnalysisPredictor;
 
   /** NOTE just for developer, not an official API, easily to be broken.
@@ -232,6 +239,10 @@ struct AnalysisConfig {
   //  subgraph, 3 as default value.
   int tensorrt_min_subgraph_size_{3};
 
+  // memory reuse related.
+  bool enable_memory_optim_{false};
+  bool memory_optim_force_update_{false};
+
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index bc3ce72f083..039389a4cf9 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
+
 #include <glog/logging.h>
 
 namespace paddle {
@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
   LOG(ERROR) << "GPU not support MKLDNN yet";
 }
 
+void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
+  analysis_passes_.push_back(pass);
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index efe1ba106a2..d3a60d20992 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -45,6 +45,9 @@ class PaddlePassBuilder {
   /** Delete all the passes that has type `pass_type`. */
   void DeletePass(const std::string &pass_type);
 
+  /** Append an analysis pass. */
+  void AppendAnalysisPass(const std::string &pass);
+
   /** Visualize the computation graph after each pass by generating a DOT
    * language file, one can draw them with the Graphviz toolkit.
    */
@@ -54,8 +57,18 @@ class PaddlePassBuilder {
   std::string DebugString();
 
   const std::vector<std::string> &AllPasses() const { return passes_; }
+  std::vector<std::string> AnalysisPasses() const {
+    auto passes = analysis_passes_;
+    // Make sure ir_graph_to_program_pass is the last pass so that any
+    // modification of the IR persists to the program.
+    passes.push_back("ir_graph_to_program_pass");
+    return passes;
+  }
 
  protected:
+  std::vector<std::string> analysis_passes_{
+      {"ir_graph_build_pass", "ir_analysis_pass",
+       "ir_params_sync_among_devices_pass"}};
   std::vector<std::string> passes_;
 };
 
@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
   /** The MKLDNN control exists in both CPU and GPU mode, because there can be
    * still some CPU kernels running in CPU mode.
    */
-  virtual void EnableMKLDNN() = 0;
+  virtual void EnableMKLDNN() {}
 
   bool use_gpu() const { return use_gpu_; }
 
@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
 
  protected:
   bool use_gpu_{false};
+  bool use_mkldnn_{false};
 };
 
 /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
     use_gpu_ = false;
   }
 
+  explicit CpuPassStrategy(const CpuPassStrategy &other)
+      : PassStrategy(other.AllPasses()) {}
+
   virtual ~CpuPassStrategy() = default;
 
   void EnableMKLDNN() override {
 // TODO(Superjomn) Consider the way to mix CPU with GPU.
 #ifdef PADDLE_WITH_MKLDNN
-    passes_.insert(passes_.begin(), "mkldnn_placement_pass");
-
-    for (auto &pass :
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",    //
-                                   "conv_bias_mkldnn_fuse_pass",    //
-                                   "conv3d_bias_mkldnn_fuse_pass",  //
-                                   "conv_relu_mkldnn_fuse_pass",    //
-                                   "conv_elementwise_add_mkldnn_fuse_pass"})) {
-      passes_.push_back(pass);
+    if (!use_mkldnn_) {
+      passes_.insert(passes_.begin(), "mkldnn_placement_pass");
+
+      for (auto &pass : std::vector<std::string>(
+               {"depthwise_conv_mkldnn_pass",    //
+                "conv_bias_mkldnn_fuse_pass",    //
+                "conv3d_bias_mkldnn_fuse_pass",  //
+                "conv_relu_mkldnn_fuse_pass",    //
+                "conv_elementwise_add_mkldnn_fuse_pass"})) {
+        passes_.push_back(pass);
+      }
     }
+    use_mkldnn_ = true;
+#else
+    use_mkldnn_ = false;
 #endif
   }
-
-  CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
 };
 
 /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
     use_gpu_ = true;
   }
 
-  GpuPassStrategy(const GpuPassStrategy &other)
+  explicit GpuPassStrategy(const GpuPassStrategy &other)
       : PassStrategy(other.AllPasses()) {
     use_gpu_ = true;
   }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index adbf98e9e8a..423c39813f0 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -19,7 +19,7 @@ endfunction()
 
 function(inference_analysis_api_test target install_dir filename)
     inference_analysis_test(${target} SRCS ${filename}
-        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
         ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
 endfunction()
 
@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
 # normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
-inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
+inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
 
 # small DAM
 set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index fc87e0a8d17..4ec9404ab42 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   std::string turn_mask_pre = "turn_mask_";
 
   auto one_batch = data->NextBatch();
+  PADDLE_ENFORCE(!one_batch.response.empty());
   int size = one_batch.response[0].size();
   CHECK_EQ(size, kMaxTurnLen);
   // turn tensor assignment
@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
+
   TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                  input_slots_all, &outputs, FLAGS_num_threads);
 
@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+// Compare result of NativeConfig and AnalysisConfig with memory optimization.
+TEST(Analyzer_dam, compare_with_memory_optim) {
+  // The small DAM model crashes (core dump) in CI, but works locally.
+  if (FLAGS_max_turn_num == 9) {
+    contrib::AnalysisConfig cfg, cfg1;
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    SetInput(&input_slots_all);
+    // Run the first time to force to update memory cache
+    SetConfig(&cfg);
+    cfg.EnableMemoryOptim(true);
+
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+        input_slots_all);
+
+    // Run second time to use the memory cache and perform memory optimization.
+    SetConfig(&cfg1);
+    cfg1.EnableMemoryOptim();
+
+    CompareNativeAndAnalysis(
+        reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
+        input_slots_all);
+  }
+}
+
 TEST(Analyzer_dam, compare) { compare(); }
+
 #ifdef PADDLE_WITH_MKLDNN
 TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
 #endif
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 7b448a32003..2db297e2005 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 TEST(Analyzer_Text_Classification, profile) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg.pass_builder()->TurnOnDebug();
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
 TEST(Analyzer_Text_Classification, compare) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
+  cfg.EnableMemoryOptim();
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
index 5a77b53a851..f3e75ffbb59 100644
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <fstream>
 #include <iostream>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
                 FLAGS_infer_model + "/__params__");
   cfg->DisableGpu();
   cfg->SwitchIrDebug();
-  cfg->SwitchSpecifyInputNames();
+  cfg->SwitchSpecifyInputNames(false);
   // TODO(TJ): fix fusion gru
   cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
 }
@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
   }
+  // cfg.pass_builder()->TurnOnDebug();
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) {
     size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
     CHECK_EQ(numel, refer.data.size());
     for (size_t i = 0; i < numel; ++i) {
-      CHECK_LT(
-          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
-          1e-5);
+      EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
+                  1e-5);
     }
   }
 }
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index ac964dc0c86..d2ca1d0b009 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <gtest/gtest.h>
+
 #include <algorithm>
 #include <string>
 #include <thread>  // NOLINT
@@ -28,9 +29,8 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/inference/utils/benchmark.h"
@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
         float *pdata = static_cast<float *>(out.data.data());
         float *pdata_ref = static_cast<float *>(ref_out.data.data());
         for (size_t j = 0; j < size; ++j) {
-          EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
+          CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
         }
         break;
       }
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 9725c190329..5aca807ee3a 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
   }
 }
 
+TEST(TensorRT_mobilenet, profile) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  profile(model_dir, true, false);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h
index 76a3dd2c299..a1304cf4e77 100644
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,8 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
 
+#pragma once
 #include <fstream>
 #include <iostream>
 #include <string>
diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc
index eb255474082..80763160df3 100644
--- a/paddle/fluid/inference/utils/benchmark_tester.cc
+++ b/paddle/fluid/inference/utils/benchmark_tester.cc
@@ -16,7 +16,7 @@
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
-using namespace paddle::inference;
+using namespace paddle::inference;  // NOLINT
 TEST(Benchmark, basic) {
   Benchmark benchmark;
   benchmark.SetName("key0");
@@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) {
   benchmark.PersistToFile("1.log");
   benchmark.PersistToFile("1.log");
   benchmark.PersistToFile("1.log");
-}
\ No newline at end of file
+}
diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc
index 86b3114cb3c..0dfed7f5cc1 100644
--- a/paddle/fluid/operators/controlflow/feed_op.cc
+++ b/paddle/fluid/operators/controlflow/feed_op.cc
@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
             << out_name;
 
     auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
 
diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h
index 10c9eb80d0a..da4c1f326fb 100644
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
   std::cerr << style << Sprintf(fmt, args...) << reset();
 }
 
+template <typename... Args>
+static void PrettyLogInfo(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::info(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogDetail(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::detail(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH1(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H1(), fmt, args...);
+}
+template <typename... Args>
+static void PrettyLogH2(const char *fmt, const Args &... args) {
+  PrettyLogEndl(Style::H2(), fmt, args...);
+}
+
 }  // namespace string
 }  // namespace paddle
-- 
GitLab


From 561ae9d50712f2f415614326443e554d10475bcd Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:03:19 +0800
Subject: [PATCH 39/73] remove legacy WITH_C_API option

---
 CMakeLists.txt                        | 17 ----------------
 cmake/external/gflags.cmake           | 10 ---------
 cmake/external/glog.cmake             |  9 ---------
 cmake/external/mkldnn.cmake           |  4 ----
 cmake/external/mklml.cmake            |  4 ----
 cmake/external/openblas.cmake         | 19 ------------------
 cmake/external/protobuf.cmake         |  9 ---------
 cmake/external/pslib.cmake            |  4 ----
 cmake/external/pslib_brpc.cmake       |  4 ----
 cmake/external/xxhash.cmake           |  9 ---------
 cmake/external/zlib.cmake             |  9 ---------
 paddle/scripts/README.md              |  1 -
 paddle/scripts/paddle_build.sh        | 29 +++------------------------
 paddle/scripts/paddle_docker_build.sh |  1 -
 14 files changed, 3 insertions(+), 126 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a51552d96a4..bbf3acb8ad2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,6 @@ option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
-option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
@@ -119,10 +118,6 @@ if(ANDROID OR IOS)
         "Disable golang when cross-compiling for Android and iOS" FORCE)
 
     # Compile PaddlePaddle mobile inference library
-    if (NOT WITH_C_API)
-        set(WITH_C_API ON CACHE STRING
-            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
-    endif()
     set(MOBILE_INFERENCE ON)
     add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
@@ -135,8 +130,6 @@ endif()
 if (WIN32)
     set(WITH_DISTRIBUTE OFF CACHE STRING
             "Disable DISTRIBUTE when compiling for Windows" FORCE)
-    set(WITH_C_API OFF CACHE STRING
-            "Disable C_API when compiling for Windows" FORCE)
     set(WITH_FLUID_ONLY ON CACHE STRING
             "Enable FLUID_ONLY when compiling for Windows" FORCE)
 endif()
@@ -150,16 +143,6 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
 set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
   "A path setting fluid inference shared and static libraries")
 
-if (WITH_C_API AND WITH_PYTHON)
-  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
-    "when using C-API. It will give an unpredictable behavior when using a "
-    "different Python interpreter from compiling.")
-endif()
-
-if (WITH_C_API)
-  set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
-endif()
-
 if(MOBILE_INFERENCE)
     set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
 else()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 95ca16f57f2..f3ca74faea3 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -71,13 +71,3 @@ if (WIN32)
     set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
   endif(HAVE_SHLWAPI)
 endif (WIN32)
-
-IF(WITH_C_API)
-  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
-  IF(ANDROID)
-    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
-  ELSE()
-    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
-  ENDIF()
-ENDIF()
-
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 8cd0455c16b..72a2f601917 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -78,12 +78,3 @@ ADD_DEPENDENCIES(glog extern_glog gflags)
 LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
-
-IF(WITH_C_API)
-  INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
-  IF(ANDROID)
-    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
-  ELSE()
-    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib)
-  ENDIF()
-ENDIF()
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 03f0dee8591..6a7be73f09a 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -110,7 +110,3 @@ else(WIN32)
 endif(WIN32)
 ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
 ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn)
-IF(WITH_C_API)
-  INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
-ENDIF()
-
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 43322a257a0..2caff273576 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -74,7 +74,3 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
 LIST(APPEND external_project_dependencies mklml)
-
-IF(WITH_C_API)
-  INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
-ENDIF()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index aeb976b840e..019745aad0d 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -92,25 +92,6 @@ IF(NOT ${CBLAS_FOUND})
     ELSE()
     ENDIF(NOT WIN32)
     SET(CBLAS_PROVIDER openblas)
-    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
-        # Because libopenblas.a is a symbolic link of another library, thus need to
-        # install the whole directory.
-        IF(ANDROID)
-            SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI})
-        ELSE()
-            SET(TMP_INSTALL_DIR third_party/openblas/lib)
-        ENDIF()
-        INSTALL(CODE "execute_process(
-            COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
-                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
-            )"
-        )
-        INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
-                \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
-            )"
-        )
-    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
 
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index e1e619e572b..16fd9fac92e 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -266,15 +266,6 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
         CACHE FILEPATH "protoc library." FORCE)
 
-    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
-        IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
-        ELSE()
-            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
-        ENDIF()
-    ENDIF()
-
     IF(CMAKE_CROSSCOMPILING)
         PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
     ELSE()
diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
index 3b495d78e2c..b4ea268e5a4 100644
--- a/cmake/external/pslib.cmake
+++ b/cmake/external/pslib.cmake
@@ -71,7 +71,3 @@ ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
 ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
 LIST(APPEND external_project_dependencies pslib)
-
-IF(WITH_C_API)
-  INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib)
-ENDIF()
diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
index 7ff5a8aca18..8b43f2ef5c9 100644
--- a/cmake/external/pslib_brpc.cmake
+++ b/cmake/external/pslib_brpc.cmake
@@ -71,7 +71,3 @@ ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
 ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
 LIST(APPEND external_project_dependencies pslib_brpc)
-
-IF(WITH_C_API)
-  INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib)
-ENDIF()
diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake
index c3e1212d8f8..a0f300c2e8b 100644
--- a/cmake/external/xxhash.cmake
+++ b/cmake/external/xxhash.cmake
@@ -73,12 +73,3 @@ include_directories(${XXHASH_INCLUDE_DIR})
 add_dependencies(xxhash extern_xxhash)
 
 LIST(APPEND external_project_dependencies xxhash)
-
-IF(WITH_C_API)
-  INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash)
-  IF(ANDROID)
-    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI})
-  ELSE()
-    INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib)
-  ENDIF()
-ENDIF()
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index d3507375372..6c8d79c25e6 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -59,12 +59,3 @@ SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 ADD_DEPENDENCIES(zlib extern_zlib)
 
 LIST(APPEND external_project_dependencies zlib)
-
-IF(WITH_C_API)
-  INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
-  IF(ANDROID)
-    INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI})
-  ELSE()
-    INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib)
-  ENDIF()
-ENDIF()
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 9e8b135c1bc..27722245064 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -69,7 +69,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
-| `WITH_C_API` | OFF | Build capi libraries for inference. |
 | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index f58e392684d..cbd39d7a5d9 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -33,7 +33,6 @@ function print_usage() {
     ${BLUE}gen_doc_lib${NONE}: generate paddle documents library
     ${BLUE}html${NONE}: convert C++ source code into HTML
     ${BLUE}dockerfile${NONE}: generate paddle release dockerfile
-    ${BLUE}capi${NONE}: generate paddle CAPI package
     ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
     ${BLUE}check_style${NONE}: run code style check
     ${BLUE}cicheck${NONE}: run CI tasks
@@ -180,7 +179,6 @@ function cmake_gen() {
         -DWITH_AVX=${WITH_AVX:-OFF}
         -DWITH_GOLANG=${WITH_GOLANG:-OFF}
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_C_API=${WITH_C_API:-OFF}
         -DWITH_PYTHON=${WITH_PYTHON:-ON}
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
@@ -217,7 +215,6 @@ EOF
         -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
         -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-        -DWITH_C_API=${WITH_C_API:-OFF} \
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
@@ -706,21 +703,10 @@ EOF
 EOF
 }
 
-function gen_capi_package() {
-    if [[ ${WITH_C_API} == "ON" ]]; then
-        capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output
-        rm -rf $capi_install_prefix
-        make DESTDIR="$capi_install_prefix" install
-        cd $capi_install_prefix/
-        ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz
-    fi
-}
-
 function gen_fluid_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
-    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
-        cat <<EOF
+    cat <<EOF
     ========================================
     Generating fluid library for train and inference ...
     ========================================
@@ -732,8 +718,7 @@ EOF
 }
 
 function tar_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
-        cat <<EOF
+    cat <<EOF
     ========================================
     Taring fluid library for train and inference ...
     ========================================
@@ -747,8 +732,7 @@ EOF
 }
 
 function test_fluid_lib() {
-    if [[ ${WITH_C_API:-OFF} == "OFF" ]] ; then
-        cat <<EOF
+    cat <<EOF
     ========================================
     Testing fluid library for inference ...
     ========================================
@@ -791,11 +775,6 @@ function main() {
       dockerfile)
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
-      capi)
-        cmake_gen ${PYTHON_ABI:-""}
-        build
-        gen_capi_package
-        ;;
       fluid_inference_lib)
         cmake_gen ${PYTHON_ABI:-""}
         gen_fluid_lib
@@ -810,7 +789,6 @@ function main() {
         build
         assert_api_not_changed ${PYTHON_ABI:-""}
         run_test
-        gen_capi_package
         gen_fluid_lib
         test_fluid_lib
         assert_api_spec_approvals
@@ -820,7 +798,6 @@ function main() {
         assert_api_spec_approvals
         ;;
       test_inference)
-        gen_capi_package
         gen_fluid_lib
         test_fluid_lib
         ;;
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 174c2a12f00..9a098dbbc66 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -28,7 +28,6 @@ function start_build_docker() {
         -e WITH_AVX=ON \
         -e WITH_GOLANG=OFF \
         -e WITH_TESTING=ON \
-        -e WITH_C_API=OFF \
         -e WITH_COVERAGE=ON \
         -e COVERALLS_UPLOAD=ON \
         -e WITH_DEB=OFF \
-- 
GitLab


From e000d17a0cc2bfb8399f82ada1939ebcea54d108 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:13:14 +0800
Subject: [PATCH 40/73] remove legacy WITH_SWIG_PY option

---
 CMakeLists.txt                             |   2 -
 cmake/external/swig.cmake                  |  65 ---
 paddle/py_paddle/.gitignore                |   2 -
 paddle/py_paddle/__init__.py               |  24 -
 paddle/py_paddle/dataprovider_converter.py | 309 -----------
 paddle/py_paddle/util.py                   | 578 ---------------------
 paddle/scripts/README.md                   |   1 -
 paddle/scripts/paddle_build.sh             |   2 -
 python/CMakeLists.txt                      |   2 -
 9 files changed, 985 deletions(-)
 delete mode 100644 cmake/external/swig.cmake
 delete mode 100644 paddle/py_paddle/.gitignore
 delete mode 100644 paddle/py_paddle/__init__.py
 delete mode 100644 paddle/py_paddle/dataprovider_converter.py
 delete mode 100644 paddle/py_paddle/util.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbf3acb8ad2..b4a700f9744 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,6 @@ option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FO
 option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
-option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
@@ -176,7 +175,6 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/ngraph)    # download, build, install nGraph
-include(external/swig)      # download, build, install swig
 include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
deleted file mode 100644
index de07703695e..00000000000
--- a/cmake/external/swig.cmake
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT WITH_SWIG_PY)
-    return()
-ENDIF()
-
-FIND_PACKAGE(SWIG)
-
-IF(NOT SWIG_FOUND)
-    # build swig as an external project
-    INCLUDE(ExternalProject)
-
-    SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
-    SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
-    SET(SWIG_TARGET_VERSION "3.0.2")
-    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
-    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
-
-    IF(WIN32)
-        # swig.exe available as pre-built binary on Windows:
-        ExternalProject_Add(swig
-            URL                 http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
-            URL_MD5             ${SWIG_DOWNLOAD_WIN_MD5}
-            SOURCE_DIR          ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   ""
-            BUILD_COMMAND       ""
-            INSTALL_COMMAND     ""
-            UPDATE_COMMAND      ""
-        )
-        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
-        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
-    ELSE(WIN32)
-        # swig uses bison find it by cmake and pass it down
-        FIND_PACKAGE(BISON)
-
-        # From SWIG configure
-        ExternalProject_Add(swig
-            GIT_REPOSITORY      https://github.com/swig/swig.git
-            GIT_TAG             rel-3.0.10
-            PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
-                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
-            BUILD_COMMAND       cd <SOURCE_DIR> && make
-            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
-            UPDATE_COMMAND      ""
-        )
-
-        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
-        SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
-    ENDIF(WIN32)
-
-    LIST(APPEND external_project_dependencies swig)
-ENDIF(NOT SWIG_FOUND)
diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore
deleted file mode 100644
index 80d1f76fbc0..00000000000
--- a/paddle/py_paddle/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-swig_paddle.py
-_swig_paddle.so
diff --git a/paddle/py_paddle/__init__.py b/paddle/py_paddle/__init__.py
deleted file mode 100644
index 5504d1d50c5..00000000000
--- a/paddle/py_paddle/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from util import DataProviderWrapperConverter
-from dataprovider_converter import DataProviderConverter
-
-__all__ = [
-    'paddle',
-    'DataProviderConverter',
-    'DataProviderWrapperConverter',  # for deprecated usage.
-    'loadParameterFile'
-]
-util.monkeypatches()
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
deleted file mode 100644
index 43614b9779d..00000000000
--- a/paddle/py_paddle/dataprovider_converter.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer.PyDataProvider2 as dp2
-import collections
-import swig_paddle
-import numpy
-import itertools
-from functools import reduce
-
-__all__ = ['DataProviderConverter']
-
-
-class IScanner(object):
-    """
-    The scanner will scan Python object two passes, then convert it to Paddle's
-    argument.
-
-    In the first pass, `pre_scan` will be invoked by every data instance, and
-    then invoke `finish_pre_scan` to arguments. And the second pass do the same
-    thing except the functions changed to `scan`, `finish_scan`.
-
-    During the first pass, a scanner may count the shape of input matrix and
-    allocate memory for this argument. Then fill the data into this  argument
-    in second pass.
-    """
-
-    def __init__(self, input_type, pos):
-        self.input_type = input_type
-        if not isinstance(self.input_type, dp2.InputType):
-            raise ValueError("input type should be dataprovider2.InputType")
-        self.pos = pos
-        # data_in_gpu is used to indicate whether to create argument on GPU
-        # or not in GPU mode. Now if using one thread (trainer_count=1),
-        # trainer uses NeuralNetwork which needs to create argument on GPU
-        # before calling forward function. So, set data_in_gpu to True.
-        # Otherwise, trainer uses MultiGradientMachine which will transfer
-        # data from CPU to GPU in the forward function, set data_in_gpu to
-        # False in this case.
-        self.data_in_gpu = swig_paddle.isUsingGpu(
-        ) and swig_paddle.getTrainerCount() == 1
-
-    def pre_scan(self, dat):
-        """
-        First pass scan method. During this method, the scanner could count the
-        data number, and get the total memory size this batch would use.
-
-        :param dat: The python object.
-        """
-        pass
-
-    def finish_pre_scan(self, argument):
-        """
-        Finish first scan pass. Allocate the memory.
-
-        :param argument: Output arguments object.
-        :type argument: swig_paddle.Arguments
-        :param dat: Output arguments object.
-        :type dat: The Python object, numpy.array or List.
-        :return:
-        """
-        pass
-
-    def scan(self, dat):
-        """
-        Second pass scan method. Copy the data to arguments.
-
-        :param dat: The python object.
-        """
-        pass
-
-    def finish_scan(self, argument):
-        """
-        Finish second pass. Finalize the resources, etc.
-
-        :param argument: Output arguments object.
-        :type argument: swig_paddle.Arguments
-        """
-        pass
-
-
-class DenseScanner(IScanner):
-    """
-    :type __mat__: numpy.ndarray
-    """
-
-    def __init__(self, input_type, pos):
-        IScanner.__init__(self, input_type, pos)
-        self.__mat__ = None
-        self.__shape__ = None
-        self.__height__ = 0
-        self.__dim__ = 0
-
-    def pre_scan(self, dat):
-        self.__height__ += 1
-        if self.__shape__ is None:
-            self.__shape__ = numpy.array(dat).shape
-            if len(self.__shape__) > 3:
-                raise ValueError(
-                    "The dimension of input cannot be greater than 3.")
-            if len(self.__shape__) == 0:
-                raise ValueError(
-                    "The input should be a vector, please check your input data."
-                )
-            self.__dim__ = reduce(lambda x, y: x * y, self.__shape__)
-            if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim:
-                raise ValueError(
-                    "The data size must be equal to it in data layer.")
-        else:
-            if self.__shape__ != numpy.array(dat).shape:
-                raise ValueError(
-                    "The data shape must be same in one mini-batch.")
-
-    def finish_pre_scan(self, argument):
-        self.__mat__ = numpy.ndarray(
-            shape=(self.__height__, self.__dim__), dtype=numpy.float32)
-        self.__height__ = 0
-
-    def scan(self, dat):
-        # It's better to use NumPy array for speed.
-        dat = numpy.array(dat)
-        dat = dat.flatten()
-        self.__mat__[self.__height__] = dat
-        self.__height__ += 1
-
-    def finish_scan(self, argument):
-        assert isinstance(argument, swig_paddle.Arguments)
-        if self.__mat__.dtype != numpy.float32:
-            self.__mat__ = self.__mat__.astype(numpy.float32)
-        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
-                                                    self.data_in_gpu)
-        argument.setSlotValue(self.pos, m)
-        if len(self.__shape__) > 1:
-            # The last-two dimenstions are the frame height and width.
-            # For example, the layout is CHW for 3-D feature of image.
-            # The H and W are the frame height and width.
-            h, w = self.__shape__[-2:]
-            argument.setSlotFrameHeight(self.pos, h)
-            argument.setSlotFrameWidth(self.pos, w)
-        self.__shape__ = None
-
-
-class SparseBinaryScanner(IScanner):
-    def __init__(self, input_type, pos):
-        IScanner.__init__(self, input_type, pos)
-        self.__rows__ = [0]
-        self.__cols__ = []
-        self.__height__ = 0
-        self.__value__ = []
-
-    def scan(self, dat):
-        self.extend_cols(dat)
-        self.__rows__.append(len(self.__cols__))
-        self.__height__ += 1
-
-    def extend_cols(self, dat):
-        self.__cols__.extend(dat)
-
-    def finish_scan(self, argument):
-        assert isinstance(argument, swig_paddle.Arguments)
-        m = swig_paddle.Matrix.createSparse(
-            self.__height__,
-            self.input_type.dim,
-            len(self.__cols__),
-            len(self.__value__) == 0,
-            False,  # trans
-            False)  # TODO supoort GPU
-        assert isinstance(m, swig_paddle.Matrix)
-        m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
-        argument.setSlotValue(self.pos, m)
-
-
-class SparseFloatScanner(SparseBinaryScanner):
-    def __init__(self, input_type, pos):
-        SparseBinaryScanner.__init__(self, input_type, pos)
-
-    def extend_cols(self, dat):
-        self.__cols__.extend((x[0] for x in dat))
-        self.__value__.extend((x[1] for x in dat))
-
-
-class IndexScanner(IScanner):
-    def __init__(self, input_type, pos):
-        IScanner.__init__(self, input_type, pos)
-        self.__ids__ = None
-        self.__idx__ = 0
-
-    def pre_scan(self, dat):
-        self.__idx__ += 1
-
-    def finish_pre_scan(self, argument):
-        self.__ids__ = [0] * self.__idx__
-        self.__idx__ = 0
-
-    def scan(self, dat):
-        self.__ids__[self.__idx__] = dat
-        self.__idx__ += 1
-
-    def finish_scan(self, argument):
-        ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
-        assert isinstance(argument, swig_paddle.Arguments)
-        argument.setSlotIds(self.pos, ids)
-
-
-class SequenceScanner(IScanner):
-    def __init__(self, input_type, pos, inner_scanner, setter):
-        IScanner.__init__(self, input_type, pos)
-        self.__seq__ = [0]
-        self.__inner_scanner__ = inner_scanner
-        self.__setter__ = setter
-
-    def pre_scan(self, dat):
-        for each in dat:
-            self.__inner_scanner__.pre_scan(each)
-
-    def finish_pre_scan(self, argument):
-        self.__inner_scanner__.finish_pre_scan(argument)
-
-    def scan(self, dat):
-        self.__seq__.append(self.__seq__[-1] + self.get_size(dat))
-        for each in dat:
-            self.__inner_scanner__.scan(each)
-
-    def finish_scan(self, argument):
-        seq = swig_paddle.IVector.create(self.__seq__, False)
-        self.__setter__(argument, self.pos, seq)
-        self.__inner_scanner__.finish_scan(argument)
-
-    def get_size(self, dat):
-        if isinstance(self.__inner_scanner__, SequenceScanner):
-            return sum(self.__inner_scanner__.get_size(item) for item in dat)
-        else:
-            return len(dat)
-
-
-class DataProviderConverter(object):
-    def __init__(self, input_types):
-        self.input_types = input_types
-        assert isinstance(self.input_types, collections.Sequence)
-        for each in self.input_types:
-            assert isinstance(each, dp2.InputType)
-
-    def convert(self, dat, argument=None):
-        if argument is None:
-            argument = swig_paddle.Arguments.createArguments(0)
-        assert isinstance(argument, swig_paddle.Arguments)
-        argument.resize(len(self.input_types))
-
-        scanners = [
-            DataProviderConverter.create_scanner(i, each_type)
-            for i, each_type in enumerate(self.input_types)
-        ]
-
-        for each_sample in dat:
-            for each_step, scanner in itertools.izip(each_sample, scanners):
-                scanner.pre_scan(each_step)
-
-        for scanner in scanners:
-            scanner.finish_pre_scan(argument)
-
-        for each_sample in dat:
-            for each_step, scanner in itertools.izip(each_sample, scanners):
-                scanner.scan(each_step)
-
-        for scanner in scanners:
-            scanner.finish_scan(argument)
-
-        return argument
-
-    def __call__(self, dat, argument=None):
-        return self.convert(dat, argument)
-
-    @staticmethod
-    def create_scanner(i, each):
-        assert isinstance(each, dp2.InputType)
-        retv = None
-        if each.type == dp2.DataType.Dense:
-            retv = DenseScanner(each, i)
-        elif each.type == dp2.DataType.Index:
-            retv = IndexScanner(each, i)
-        elif each.type == dp2.DataType.SparseNonValue:
-            retv = SparseBinaryScanner(each, i)
-        elif each.type == dp2.DataType.SparseValue:
-            retv = SparseFloatScanner(each, i)
-        assert retv is not None
-
-        if each.seq_type == dp2.SequenceType.SUB_SEQUENCE:
-            retv = SequenceScanner(
-                each, i, retv,
-                lambda a, p, seq: a.setSlotSubSequenceStartPositions(p, seq))
-
-        if each.seq_type in [
-                dp2.SequenceType.SUB_SEQUENCE, dp2.SequenceType.SEQUENCE
-        ]:
-            retv = SequenceScanner(
-                each, i, retv,
-                lambda a, p, seq: a.setSlotSequenceStartPositions(p, seq))
-        return retv
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
deleted file mode 100644
index 3ae8dbf964c..00000000000
--- a/paddle/py_paddle/util.py
+++ /dev/null
@@ -1,578 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Some Useful method for py_paddle.
-"""
-
-import swig_paddle
-import os
-import paddle.trainer.PyDataProviderWrapper
-import paddle.proto.ParameterConfig_pb2
-import paddle.proto.ModelConfig_pb2
-import paddle.proto.TrainerConfig_pb2
-import weakref
-import numpy
-import struct
-import sys
-import copy
-
-
-def initializePaddle(*args):
-    """
-    To initialize paddle process.
-    :param args: Command line options, such as --use_gpu=0, etc.
-    :return: Nothing.
-    """
-    old_argv = copy.deepcopy(sys.argv)
-    old_pypath = os.getenv("PYTHONPATH")
-    pypath = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
-    if old_pypath is not None:
-        pypath = os.pathsep.join([pypath, old_pypath])
-        os.putenv("PYTHONPATH", pypath)
-    args = [""] + list(args)  # argv[0] is command name, it is not important.
-    swig_paddle.__initPaddle__(args)
-    sys.argv = old_argv
-
-
-def __monkeypatch_init_paddle__():
-    swig_paddle.__initPaddle__ = swig_paddle.initPaddle
-    swig_paddle.initPaddle = initializePaddle
-
-
-class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback):
-    """
-    Wrap the python callable object to paddle.UpdateCallback.
-
-    INTERNAL USE ONLY.
-    """
-
-    def __init__(self, callback):
-        swig_paddle.UpdateCallback.__init__(self)
-        self.callback = callback
-
-    def apply(self, param):
-        self.callback(param)
-
-    @staticmethod
-    def wrap(callback):
-        """
-        Cast the python callable object/paddle.UpdateCallback to
-        swig_paddle.UpdateCallback.__disown__
-        :param callback: callable or swig_paddle.UpdateCallback object.
-        """
-        if isinstance(callback, swig_paddle.UpdateCallback):
-            return callback.__disown__()
-        elif isinstance(callback, weakref.ProxyType):
-            raise RuntimeError("Should not pass __disown__ object")
-        else:
-            return __ParameterCallbackWrapper__(callback).__disown__()
-
-
-def __arguments_to_numpy__(i, arg):
-    assert isinstance(arg, swig_paddle.Arguments)
-    value = arg.getSlotValue(i)
-    ids = arg.getSlotIds(i)
-    prob = arg.getSlotIn(i)
-    if value is not None:
-        assert isinstance(value, swig_paddle.Matrix)
-        value = value.copyToNumpyMat()
-    if ids is not None:
-        assert isinstance(ids, swig_paddle.IVector)
-        ids = ids.copyToNumpyArray()
-    if prob is not None:
-        assert isinstance(prob, swig_paddle.Matrix)
-        prob = prob.copyToNumpyMat()
-    return {"value": value, "id": ids, "prob": prob}
-
-
-def __monkeypatch_gradient_machine__():
-    """
-    Add some class methods to GradientMachine.
-    This method should be only used internally.
-    """
-    swig_paddle.GradientMachine.loadFromConfigFile = \
-        staticmethod(loadGradientMachine)
-
-    def __matrix_to_numpy__(m):
-        if isinstance(m, swig_paddle.Matrix):
-            return m.copyToNumpyMat()
-        elif isinstance(m, swig_paddle.IVector):
-            return m.copyToNumpyArra()
-        else:
-            raise RuntimeError("Input arg should be matrix or vecotr.")
-
-    def createFromConfigProto(protoObj,
-                              createMode=swig_paddle.CREATE_MODE_NORMAL,
-                              paramTypes=[
-                                  swig_paddle.PARAMETER_VALUE,
-                                  swig_paddle.PARAMETER_GRADIENT,
-                                  swig_paddle.PARAMETER_MOMENTUM
-                              ]):
-        """
-        Create Gradient Machine From Proto object.
-        :param protoObj: Model config
-        :type protoObj: proto.ModelConfig_pb2.ModelConfig
-        :param createMode: Create Mode, default is normal.
-        :type createMode: int
-        :param paramTypes: the gradient machine parameter type.
-        :type paramTypes: list of int
-        :return: paddle.GradientMachine
-        """
-        assert isinstance(protoObj, paddle.proto.ModelConfig)
-        return swig_paddle.GradientMachine.createByConfigProtoStr(
-            protoObj.SerializeToString(), createMode, paramTypes)
-
-    swig_paddle.GradientMachine.createFromConfigProto = \
-        staticmethod(createFromConfigProto)
-
-    def forwardTest(self, inArgs):
-        """
-        forwardTest. forward gradient machine in test mode, and return a numpy
-        matrix dict.
-
-        :param inArgs: The input arguments
-        :type inArgs: paddle.Arguments
-        :return: A dictionary with keys ['id', 'value'], each value is a
-                 numpy.ndarray.
-        """
-        outArgs = swig_paddle.Arguments.createArguments(0)
-        self.forward(inArgs, outArgs, swig_paddle.PASS_TEST)
-        return [
-            __arguments_to_numpy__(i, outArgs)
-            for i in xrange(outArgs.getSlotNum())
-        ]
-
-    swig_paddle.GradientMachine.forwardTest = forwardTest
-
-    # Monkey patching backward
-    swig_paddle.GradientMachine.__backward__ = swig_paddle.GradientMachine.backward
-
-    def backward(self, callback):
-        """
-        GradientMachine Backward
-        :param callback: a callback which parameter is (paddle.Parameter) or
-                         a paddle.UpdateCallback object.
-        """
-        self.__backward__(__ParameterCallbackWrapper__.wrap(callback))
-
-    swig_paddle.GradientMachine.backward = backward
-
-    # Monkey patching forwardBackward.
-    swig_paddle.GradientMachine.__forwardBackward__ = \
-        swig_paddle.GradientMachine.forwardBackward
-
-    def forwardBackward(self,
-                        inArgs,
-                        outArgs,
-                        passType,
-                        callback=swig_paddle.UpdateCallback()):
-        """
-        GradientMachine forward backward.
-        :param inArgs: Input Arguments for GradientMachine.
-        :type inArgs: paddle.Arguments
-        :param outArgs: Output Arguments for GradientMachine.
-        :type outArgs: paddle.Arguments
-        :param passType: gradient machine's pass type.
-        :type passType: paddle.PassType
-        :param callback: a callable object with arguments (paddle.Parameter) or
-                         a paddle.UpdateCallback it will be called when
-                         backward
-        """
-        self.__forwardBackward__(inArgs, outArgs, passType,
-                                 __ParameterCallbackWrapper__.wrap(callback))
-
-    swig_paddle.GradientMachine.forwardBackward = forwardBackward
-
-    def getParameters(self):
-        return (self.getParameter(i) for i in xrange(self.getParameterSize()))
-
-    swig_paddle.GradientMachine.getParameters = getParameters
-
-    def getNonStaticParameters(self):
-        return (self.getNonStaticParameter(i)
-                for i in xrange(self.getNonStaticParameterSize()))
-
-    swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters
-
-    def getLayerOutputs(self, layerNames):
-        """
-        getLayerOutputs. get outputs of layers and return a numpy matrix dict.
-        :param layerNames: layer names.
-        :type layerNames: string or list.
-        """
-        if isinstance(layerNames, basestring):
-            layerNames = [layerNames]
-        elif not isinstance(layerNames, list):
-            raise RuntimeError("Input args shuld be string or a sting list.")
-
-        output = dict()
-        for name in layerNames:
-            output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name))
-        return output
-
-    swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs
-
-
-def loadGradientMachine(config_filename, model_dir=None):
-    """
-    Load a gradient machine from config file name/path.
-    :param config_filename: The trainer config file name/path
-    :param model_dir: The model parameter directory. None if same as the
-    directory of config_filename
-    :return: GradientMachine with some enhance methods.
-    :rtype: paddle.GradientMachine
-    """
-    trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
-        config_filename)
-    assert isinstance(trainer_config, swig_paddle.TrainerConfig)
-    model_conf = trainer_config.getModelConfig()
-    network = swig_paddle.GradientMachine.createByModelConfig(model_conf)
-    assert isinstance(network, swig_paddle.GradientMachine)
-    if model_dir is None:
-        model_dir = os.path.dirname(config_filename)
-    network.loadParameters(model_dir)
-    return network
-
-
-def loadParameterFile(fn):
-    """
-    Load Paddle Parameter file to numpy.ndarray
-    :param fn: file name or file like object.
-    :type fn: str or file like object.
-    :return: numpy array
-    :rtype: numpy.ndarray
-    :raise: paddle.UnsupportError when parameter format is wrong.
-    """
-    if isinstance(fn, str):
-        with open(fn, 'rb') as f:
-            return loadParameterFile(f)
-    elif hasattr(fn, 'read'):  # File like object
-        version, = struct.unpack('i', fn.read(4))
-        if version != 0:
-            raise swig_paddle.UnsupportError()
-        value_length, = struct.unpack("I", fn.read(4))
-        if value_length != 4 and value_length != 8:
-            raise swig_paddle.UnsupportError()
-        dtype = 'float32' if value_length == 4 else 'float64'
-        param_size, = struct.unpack("L", fn.read(8))
-        value = numpy.fromfile(fn, dtype)
-        if len(value) != param_size:
-            raise swig_paddle.UnsupportError()
-        return value
-    else:
-        raise swig_paddle.UnsupportError()
-
-
-class DataProviderWrapperConverter(object):
-    """
-    A class convert DataFormat from PyDataProvider Wrapper to
-    py_paddle.paddle.Arguemnts.
-    """
-
-    class DenseValueConverter(object):
-        """
-        Internal class
-        """
-
-        def __init__(self, header_def):
-            self.__dim__ = header_def.dim
-            self.buf = []
-
-        def append(self, other):
-            assert len(other) == self.__dim__
-            self.buf += other
-
-        def __call__(self, slot_idx, arg):
-            mat = swig_paddle.Matrix.createDense(self.buf,
-                                                 len(self.buf) / self.__dim__,
-                                                 self.__dim__)
-            arg.setSlotValue(slot_idx, mat)
-
-    class IdValueConverter(object):
-        """
-        Internal class
-        """
-
-        def __init__(self, *args):
-            self.buf = []
-
-        def append(self, other):
-            assert isinstance(other, int)
-            self.buf.append(other)
-
-        def __call__(self, slot_idx, arg):
-            arg.setSlotIds(slot_idx, swig_paddle.IVector.create(self.buf))
-
-    class SparseNonValueConverter(object):
-        """
-        Internal class
-        """
-
-        def __init__(self, slot_def):
-            self.indices = [0]
-            self.cols = []
-            self.dim = slot_def.dim
-
-        def append(self, other):
-            self.indices.append(self.indices[-1] + len(other))
-            self.cols += other
-
-        def __call__(self, slot_idx, arg):
-            mat = swig_paddle.Matrix.createSparse(
-                len(self.indices) - 1, self.dim, len(self.cols), True)
-            assert isinstance(mat, swig_paddle.Matrix)
-            mat.sparseCopyFrom(self.indices, self.cols)
-            self.putIntoArg(slot_idx, arg, mat)
-
-        def putIntoArg(self, slot_idx, arg, mat):
-            arg.setSlotValue(slot_idx, mat)
-
-    class SparseValueConverter(SparseNonValueConverter):
-        """
-        Internal class
-        """
-
-        def __init__(self, slot_def):
-            super(DataProviderWrapperConverter.SparseValueConverter,
-                  self).__init__(slot_def)
-            self.values = []
-
-        def append(self, other):
-            super(DataProviderWrapperConverter.SparseValueConverter,
-                  self).append(map(lambda x: x[0], other))
-            self.values += map(lambda x: x[1], other)
-
-        def __call__(self, slot_idx, arg):
-            mat = swig_paddle.Matrix.createSparse(
-                len(self.indices) - 1, self.dim, len(self.cols), False)
-            assert isinstance(mat, swig_paddle.Matrix)
-            mat.sparseCopyFrom(self.indices, self.cols, self.values)
-            self.putIntoArg(slot_idx, arg, mat)
-
-    __SLOT_VALUE_CONVERTER_MAP__ = {
-        paddle.trainer.PyDataProviderWrapper.DenseSlot: DenseValueConverter,
-        paddle.trainer.PyDataProviderWrapper.IndexSlot: IdValueConverter,
-        paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot:
-        SparseNonValueConverter,
-        paddle.trainer.PyDataProviderWrapper.SparseValueSlot:
-        SparseValueConverter
-    }
-
-    def __init__(self, use_seq, header):
-        """
-        Ctor
-        :param use_seq: True if use sequence.
-        :param header:  List of slots type,
-                       trainer.PyDataProviderWrapper.SlotType
-        """
-        self.__use_seq__ = use_seq
-        self.__header__ = header
-
-    def convert(self, wrapper_data, argument=None):
-        """
-        Convert PyDataProviderWrapper format to paddle.Argument
-        :param wrapper_data: PyDataProviderWrapper yield's data list.
-        :param argument: The output paddle.Arguments.
-                        If it is not None, it will assign data in this
-                        arguments, else it will create new arguments.
-        :return: arguments that contains data.
-        :rtype: paddle.Arguments
-        """
-        if argument is None:
-            argument = swig_paddle.Arguments.createArguments(0)
-        assert isinstance(argument, swig_paddle.Arguments)
-        argument.resize(len(self.__header__))
-
-        values = map(
-            lambda x: DataProviderWrapperConverter.__SLOT_VALUE_CONVERTER_MAP__[x.__class__](x),
-            self.__header__)
-
-        if self.__use_seq__:
-            seq_dim = [[] for _ in xrange(self.__header__.__len__())]
-            seq_start_pos = [[0] for _ in xrange(self.__header__.__len__())]
-
-            for each_sample in wrapper_data:
-                for slot_idx, sequence in enumerate(each_sample):
-                    for raw_data in sequence:
-                        values[slot_idx].append(raw_data)
-                    seq_start_pos[slot_idx].append(seq_start_pos[slot_idx][-1] +
-                                                   len(sequence))
-                    seq_dim[slot_idx].append(len(sequence))
-
-            for slot_idx in xrange(len(self.__header__)):
-                argument.setSlotSequenceDim(
-                    slot_idx, swig_paddle.IVector.create(seq_dim[slot_idx]))
-                argument.setSlotSequenceStartPositions(
-                    slot_idx,
-                    swig_paddle.IVector.create(seq_start_pos[slot_idx]))
-        else:
-            for each_sample in wrapper_data:
-                for raw_data, value in zip(each_sample, values):
-                    value.append(raw_data)
-
-        for i, v in enumerate(values):
-            v(i, argument)
-
-        return argument
-
-    def __call__(self, wrapper_data, argument=None):
-        """
-        Invoke self.convert. See documents in self.convert.
-        """
-        return self.convert(wrapper_data, argument)
-
-
-def __monkey_patch_protobuf_objects__():
-    def ParameterConfig_toProto(self):
-        """
-        Convert paddle.ParameterConfig to
-        proto.ParameterConfig_pb2.ParameterConfig
-
-        :return: proto.ParameterConfig_pb2.ParameterConfig object.
-        """
-        param_conf = paddle.proto.ParameterConfig_pb2.ParameterConfig()
-        param_conf.ParseFromString(self.toProtoString())
-        return param_conf
-
-    swig_paddle.ParameterConfig.toProto = ParameterConfig_toProto
-
-    def OptimizationConfig_toProto(self):
-        """
-        Convert paddle.OptimizationConfig to
-        proto.TrainerConfig_pb2.OptimizationConfig
-
-        :return: proto.TrainerConfig_pb2.OptimizationConfig
-        """
-        opt_conf = proto.TrainerConfig_pb2.OptimizationConfig()
-        opt_conf.ParseFromString(self.toProtoString())
-        return opt_conf
-
-    swig_paddle.OptimizationConfig.toProto = OptimizationConfig_toProto
-
-    def OptimizationConfig_createFromProto(protoObj):
-        """
-        Create a new paddle.OptimizationConfig from
-        proto.TrainerConfig_pb2.OptimizationConfig
-
-        :param protoObj: proto.TrainerConfig_pb2.OptimizationConfig
-        :return: paddle.OptimizationConfig
-        """
-
-        assert isinstance(protoObj, paddle.proto.OptimizationConfig)
-        return swig_paddle.OptimizationConfig.createFromProtoString(
-            protoObj.SerializeToString())
-
-    swig_paddle.OptimizationConfig.createFromProto = staticmethod(
-        OptimizationConfig_createFromProto)
-
-    def TrainerConfig_createFromProto(protoObj):
-        """
-        Create a new paddle.TrainerConfig from
-        proto.OptimizationConfig
-
-        :param protoObj: proto.TrainerConfig
-        :return: paddle.TrainerConfig
-        """
-        assert isinstance(protoObj, paddle.proto.TrainerConfig)
-        return swig_paddle.TrainerConfig.createFromProtoString(
-            protoObj.SerializeToString())
-
-    swig_paddle.TrainerConfig.createFromProto = staticmethod(
-        TrainerConfig_createFromProto)
-
-
-def __monkey_patch_parameter__():
-    def getBufs(self):
-        """
-        get all parameter vectors.
-        NOTE: the return value is a generator. Maybe you need to cast to
-        list or tuple or something else.
-
-        :return: generator of all parameter vectors.
-        :rtype: generator
-        """
-        return (self.getBuf(i) for i in xrange(swig_paddle.NUM_PARAMETER_TYPES))
-
-    swig_paddle.Parameter.getBufs = getBufs
-
-
-def __monkey_patch_trainer__():
-    swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create)
-
-    def Trainer_create(config, model=None):
-        """
-        Create a trainer for model with TrainerCOnfig trainer_config
-        trainer_config.model_config will be ignored when model is supplied.
-        Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only
-        when trainer_config.data_config is set.
-
-        A typical usage for Trainer is:
-        .. code-block:: python
-           trainer = Trainer.create(trainer_config, model)
-           for p in xrange(num_passes)
-               while True:
-                   data = get_next_batch(batch_size)
-                   if not data:
-                       break
-                   trainer.trainOneDataBatch(batch_size, data)
-               trainer.finishTrainPass()
-           trainer.finishTrain()
-
-        The trainer will take care of logging, model saving, distributed
-        training, etc.
-
-        :param config: trainer configuration
-        :type config: paddle.proto.TrainerConfig
-        :param model: the model to be trained
-        :type model: swig_paddle.GradientMachine
-        :return: a trainer
-        :rtype swig_paddle.Trainer
-
-        """
-        assert isinstance(config, paddle.proto.TrainerConfig)
-        if model is not None:
-            assert isinstance(model, swig_paddle.GradientMachine)
-        return swig_paddle.Trainer.__create__(
-            swig_paddle.TrainerConfig.createFromProto(config), model)
-
-    swig_paddle.Trainer.create = staticmethod(Trainer_create)
-
-    swig_paddle.Trainer.__getForwardOutput__ = \
-        swig_paddle.Trainer.getForwardOutput
-
-    def getForwardOutput(self):
-        """
-        Get the netword outputs from the previous trainOneBatch(),
-        trainOneDataBatch(), testOneDataPatch(), or forwardOneBatch() call.
-
-        :return: list of dictionary with keys ['id', 'value'], each value is a
-                 numpy.ndarray.
-        """
-        outArgs = self.__getForwardOutput__()
-        return [
-            __arguments_to_numpy__(i, outArgs)
-            for i in xrange(outArgs.getSlotNum())
-        ]
-
-    swig_paddle.Trainer.getForwardOutput = getForwardOutput
-
-
-def monkeypatches():
-    patches = [
-        __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
-        __monkey_patch_protobuf_objects__, __monkey_patch_parameter__,
-        __monkey_patch_trainer__
-    ]
-    for patch in patches:
-        patch()
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index 27722245064..dd3242f62ba 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -68,7 +68,6 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
-| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
 | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
 | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
 | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index cbd39d7a5d9..a06952782b3 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -180,7 +180,6 @@ function cmake_gen() {
         -DWITH_GOLANG=${WITH_GOLANG:-OFF}
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
         -DWITH_PYTHON=${WITH_PYTHON:-ON}
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
         -DCUDNN_ROOT=/usr/
         -DWITH_TESTING=${WITH_TESTING:-ON}
         -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
@@ -214,7 +213,6 @@ EOF
         -DWITH_AVX=${WITH_AVX:-OFF} \
         -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
         -DWITH_PYTHON=${WITH_PYTHON:-ON} \
         -DCUDNN_ROOT=/usr/ \
         -DWITH_TESTING=${WITH_TESTING:-ON} \
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 37ad77549c8..59e695e6fcb 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -39,7 +39,6 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
 IF(WIN32)
     add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/
-            COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
             COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
             COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
@@ -48,7 +47,6 @@ ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND touch stub.cc
 		COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
-		COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
 		COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-- 
GitLab


From b4ccae75c044f513d18dc08b383ca5c3ce3be41a Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:37:52 +0800
Subject: [PATCH 41/73] remove legacy target in cmake/util.cmake

---
 cmake/util.cmake | 115 -----------------------------------------------
 1 file changed, 115 deletions(-)

diff --git a/cmake/util.cmake b/cmake/util.cmake
index 0dc33ce3851..02667dbce69 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -53,118 +53,3 @@ function(target_circle_link_libraries TARGET_NAME)
                 "-Wl,--end-group")
     endif()
 endfunction()
-
-# compile_cu_as_cpp
-# Make a cu file compiled as C++
-# Arguments: Source files
-macro(compile_cu_as_cpp)
-    foreach(s ${ARGN})
-        set_source_files_properties(${s} PROPERTIES LANGUAGE CXX)
-        set_source_files_properties(${s} PROPERTIES COMPILE_FLAGS "-x c++")
-    endforeach()
-endmacro()
-
-# link_paddle_exe
-# add paddle library for a paddle executable, such as trainer, pserver.
-#
-# It will handle WITH_PYTHON etc.
-function(link_paddle_exe TARGET_NAME)
-    if(WITH_RDMA)
-        generate_rdma_links()
-    endif()
-
-    if(MOBILE_INFERENCE)
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_gserver
-            paddle_function
-            ARCHIVE_END
-            paddle_math
-            paddle_utils
-            paddle_parameter
-            paddle_proto
-            paddle_cuda
-            ${EXTERNAL_LIBS}
-            ${CMAKE_THREAD_LIBS_INIT}
-            ${CMAKE_DL_LIBS}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    else()
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_gserver
-            paddle_function
-            ARCHIVE_END
-            paddle_pserver
-            paddle_trainer_lib
-            paddle_network
-            paddle_math
-            paddle_utils
-            paddle_parameter
-            paddle_proto
-            paddle_cuda
-            paddle_optimizer
-            ${EXTERNAL_LIBS}
-            ${CMAKE_THREAD_LIBS_INIT}
-            ${CMAKE_DL_LIBS}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    endif()
-
-    if(ANDROID)
-        target_link_libraries(${TARGET_NAME} log)
-    endif(ANDROID)
-
-    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
-      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
-    endif()
-
-    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
-endfunction()
-
-# link_paddle_test
-# Link a paddle unittest for target
-# TARGET_NAME: the unittest target name
-# Rest Arguemnts: not used.
-function(link_paddle_test TARGET_NAME)
-    link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME}
-                          paddle_test_main
-                          paddle_test_util
-                          ${GTEST_LIBRARIES})
-endfunction()
-
-# add_unittest_without_exec
-#
-# create a paddle unittest. not specifically define how to run this unittest.
-# TARGET_NAME: the unittest target name, same as executable file name
-# Rest Arguments: the source files to compile this unittest.
-macro(add_unittest_without_exec TARGET_NAME)
-    add_executable(${TARGET_NAME} ${ARGN})
-    link_paddle_test(${TARGET_NAME})
-endmacro()
-
-# add_unittest
-# create a paddle unittest and just to execute this binary to make unittest.
-#
-# TARGET_NAME: the unittest target name, same as executable file name
-# Rest Arguments: the source files to compile this unittest.
-macro(add_unittest TARGET_NAME)
-    add_unittest_without_exec(${TARGET_NAME} ${ARGN})
-    add_test(${TARGET_NAME} ${TARGET_NAME})
-endmacro()
-
-# add_simple_unittest
-# create a paddle unittest with file name. It just compile ${TARGET_NAME}.cpp to
-# ${TARGET_NAME} and then execute it.
-macro(add_simple_unittest TARGET_NAME)
-    add_unittest(${TARGET_NAME} ${TARGET_NAME}.cpp)
-endmacro()
-
-# Creates C resources file from files in given resource file
-function(create_resources res_file output_file)
-  add_custom_command(
-    OUTPUT ${output_file}
-    COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
-    DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
-endfunction()
-- 
GitLab


From 9353bc58ddf4a07117eabf1bcc8698731dace5ae Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:38:58 +0800
Subject: [PATCH 42/73] remove legacy MOBILE_INFERENCE option

---
 CMakeLists.txt                    | 10 +---------
 cmake/external/cares.cmake        |  2 +-
 cmake/external/grpc.cmake         |  2 +-
 cmake/external/gzstream.cmake     |  4 ----
 cmake/external/protobuf.cmake     | 15 +--------------
 cmake/external/snappy.cmake       |  4 ----
 cmake/external/snappystream.cmake |  4 ----
 cmake/external/warpctc.cmake      |  4 ----
 cmake/generic.cmake               |  8 +-------
 cmake/inference_lib.cmake         | 32 +++++++++++++++----------------
 10 files changed, 20 insertions(+), 65 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b4a700f9744..8136829a509 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,10 +115,6 @@ if(ANDROID OR IOS)
         "Disable nGraph when cross-compiling for Android and iOS" FORCE)
     set(WITH_GOLANG OFF CACHE STRING
         "Disable golang when cross-compiling for Android and iOS" FORCE)
-
-    # Compile PaddlePaddle mobile inference library
-    set(MOBILE_INFERENCE ON)
-    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 
 if (APPLE)
@@ -142,11 +138,7 @@ set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
 set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
   "A path setting fluid inference shared and static libraries")
 
-if(MOBILE_INFERENCE)
-    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
-else()
-    set(THIRD_PARTY_BUILD_TYPE Release)
-endif()
+set(THIRD_PARTY_BUILD_TYPE Release)
 
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
index a743b572a6c..52507a6ae4a 100644
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -13,7 +13,7 @@
 # limitations under the License.
 #
 
-IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+IF(NOT WITH_DISTRIBUTE)
     return()
 ENDIF()
 
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index fd9835d023c..c5754da59bf 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -13,7 +13,7 @@
 # limitations under the License.
 #
 
-IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+IF(NOT WITH_DISTRIBUTE)
     return()
 ENDIF()
 
diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake
index 3e36ef7ae20..af7a8bfda6f 100644
--- a/cmake/external/gzstream.cmake
+++ b/cmake/external/gzstream.cmake
@@ -13,10 +13,6 @@
 # limitations under the License.
 #
 
-IF(MOBILE_INFERENCE)
-    return()
-ENDIF()
-
 include (ExternalProject)
 
 # NOTE: gzstream is needed when linking with ctr reader.
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 16fd9fac92e..d0f7f7409ba 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -204,15 +204,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
 
     SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
     SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
-    IF(MOBILE_INFERENCE)
-        # The reason why the official version is not used is described in
-        # https://github.com/PaddlePaddle/Paddle/issues/6114
-        SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
-        SET(PROTOBUF_TAG "v3.2.0")
-        IF(NOT BUILD_FOR_HOST)
-            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
-        ENDIF()
-    ENDIF()
 
     ExternalProject_Add(
         ${TARGET_NAME}
@@ -240,11 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     )
 ENDFUNCTION()
 
-IF(NOT MOBILE_INFERENCE)
-    SET(PROTOBUF_VERSION 3.1)
-ELSE()
-    SET(PROTOBUF_VERSION 3.2)
-ENDIF()
+SET(PROTOBUF_VERSION 3.1)
 IF(CMAKE_CROSSCOMPILING)
     build_protobuf(protobuf_host TRUE)
     LIST(APPEND external_project_dependencies protobuf_host)
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index f9d4cd97400..27d075336d5 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if(MOBILE_INFERENCE OR RPI)
-    return()
-endif()
-
 include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 1ec79462c14..392f186b7ce 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-IF(MOBILE_INFERENCE OR RPI)
-    return()
-ENDIF()
-
 include (ExternalProject)
 
 set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 7b937c93feb..7a25aaf15f2 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,10 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-IF(MOBILE_INFERENCE)
-    return()
-ENDIF()
-
 INCLUDE(ExternalProject)
 
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 3f1be11d855..7dd59577c4e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -655,12 +655,6 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
   set(${SRCS})
   set(${HDRS})
 
-  if (MOBILE_INFERENCE)
-      set(EXTRA_FLAG "lite:")
-  else()
-      set(EXTRA_FLAG "")
-  endif()
-
   foreach(FIL ${ARGN})
     get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
     get_filename_component(FIL_WE ${FIL} NAME_WE)
@@ -677,7 +671,7 @@ function(paddle_protobuf_generate_cpp SRCS HDRS)
       COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
       COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
       -I${CMAKE_CURRENT_SOURCE_DIR}
-      --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
+      --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
       DEPENDS ${ABS_FIL} protoc
       COMMENT "Running C++ protocol buffer compiler on ${FIL}"
       VERBATIM )
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 3e11d332ff7..a7dce4dfdb5 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -149,25 +149,23 @@ if (WITH_NGRAPH)
             )
 endif ()
 
-if (NOT MOBILE_INFERENCE AND NOT RPI)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
-    copy(snappy_lib
-            SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS snappy)
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
+copy(snappy_lib
+        SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib
+        DEPS snappy)
 
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
-    copy(snappystream_lib
-            SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS snappystream)
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
+copy(snappystream_lib
+        SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib
+        DEPS snappystream)
 
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
-    copy(zlib_lib
-            SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
-            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS zlib)
-endif ()
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+copy(zlib_lib
+        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib
+        DEPS zlib)
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-- 
GitLab


From 2d529186f1592d3751d83c58f0818de7ec7aa0de Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:46:54 +0800
Subject: [PATCH 43/73] remove legacy CMAKE_CROSSCOMPILING option

---
 CMakeLists.txt                           |   4 +-
 cmake/cblas.cmake                        |  28 +-
 cmake/configure.cmake                    |  10 +-
 cmake/cross_compiling/android.cmake      | 236 ---------------
 cmake/cross_compiling/host.cmake         |  49 ----
 cmake/cross_compiling/ios.cmake          | 347 -----------------------
 cmake/cross_compiling/raspberry_pi.cmake |  84 ------
 cmake/cuda.cmake                         |   4 +-
 cmake/external/openblas.cmake            |  38 +--
 cmake/external/protobuf.cmake            |  18 +-
 cmake/flags.cmake                        |   6 +-
 11 files changed, 28 insertions(+), 796 deletions(-)
 delete mode 100644 cmake/cross_compiling/android.cmake
 delete mode 100644 cmake/cross_compiling/host.cmake
 delete mode 100644 cmake/cross_compiling/ios.cmake
 delete mode 100644 cmake/cross_compiling/raspberry_pi.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8136829a509..de62382b783 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,9 +33,7 @@ if(WIN32)
     set(CMAKE_CXX_FLAGS_RELEASE   "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
 endif(WIN32)
 
-if(NOT CMAKE_CROSSCOMPILING)
-    find_package(CUDA QUIET)
-endif(NOT CMAKE_CROSSCOMPILING)
+find_package(CUDA QUIET)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
 
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 24de8d9d7ce..74b1ef2122c 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -64,24 +64,18 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
-if(NOT CMAKE_CROSSCOMPILING)
-  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-    ${REFERENCE_CBLAS_ROOT}/include
-    /usr/include
-    /usr/include/cblas
-  )
+set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+  ${REFERENCE_CBLAS_ROOT}/include
+  /usr/include
+  /usr/include/cblas
+)
 
-  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-    ${REFERENCE_CBLAS_ROOT}/lib
-    /usr/lib
-    /usr/lib/blas/reference/
-    /usr/lib/reference/
-  )
-else()
-  # Disable the finding of reference cblas under host's system path
-  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
-  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
-endif()
+set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+  ${REFERENCE_CBLAS_ROOT}/lib
+  /usr/lib
+  /usr/lib/blas/reference/
+  /usr/lib/reference/
+)
 
 if(WITH_SYSTEM_BLAS)
   find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e3d856fb30d..076e839120d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -49,12 +49,10 @@ if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
 
-if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX_FOUND)
-        set(SIMD_FLAG ${AVX_FLAG})
-    elseif(SSE3_FOUND)
-        set(SIMD_FLAG ${SSE3_FLAG})
-    endif()
+if(WITH_AVX AND AVX_FOUND)
+    set(SIMD_FLAG ${AVX_FLAG})
+elseif(SSE3_FOUND)
+    set(SIMD_FLAG ${SSE3_FLAG})
 endif()
 
 if(WIN32)
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
deleted file mode 100644
index 4cf2be3bdf0..00000000000
--- a/cmake/cross_compiling/android.cmake
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This is a toolchain file for cross-compiling for Android, and the
-# configuration refers to the open-source resposity:
-#     https://github.com/taka-no-me/android-cmake
-# Most of the variables are compatible with that used in
-#     https://developer.android.com/ndk/guides/cmake.html
-# The supported variables are listed belows:
-# 
-# ANDROID_STANDALONE_TOOLCHAIN
-# ANDROID_TOOLCHAIN
-# ANDROID_ABI
-# ANDROID_NATIVE_API_LEVEL
-# ANDROID_ARM_MODE
-# ANDROID_ARM_NEON
-#
-# For CMake >= 3.7.0, all the settings will be delivered to CMake system
-# variables to let CMake do the cross-compiling configurations itself.
-# More detail of cross-compiling settings
-#     https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html
-
-IF(NOT ANDROID)
-    return()
-ENDIF()
-
-# check the exist of android standalone toolchain
-IF(NOT DEFINED ANDROID_STANDALONE_TOOLCHAIN)
-    SET(ANDROID_STANDALONE_TOOLCHAIN $ENV{ANDROID_STANDALONE_TOOLCHAIN}
-        CACHE PATH "Folder holds the standalone toolchain of Android NDK")
-ENDIF()
-IF(NOT ANDROID_STANDALONE_TOOLCHAIN)
-    MESSAGE(WARNING "It is recommended to set ANDROID_STANDALONE_TOOLCHAIN to "
-            "use a standalone toolchain.\n"
-            "To cross-compile for Android, you need to:\n"
-            "1. Download an Android NDK from"
-            " https://developer.android.com/ndk/downloads/index.html\n"
-            "2. Setup a standalone toolchain"
-            "https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn\n")
-ENDIF()
-
-IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
-    IF(ANDROID_NATIVE_API_LEVEL MATCHES "^android-[0-9]+$")
-        STRING(REPLACE "android-" "" CMAKE_SYSTEM_VERSION "${CMAKE_MATCH_0}")
-    ELSEIF(ANDROID_NATIVE_API_LEVEL MATCHES "^[0-9]+$")
-        SET(CMAKE_SYSTEM_VERSION ${ANDROID_NATIVE_API_LEVEL})
-    ENDIF()
-ENDIF()
-
-IF(NOT DEFINED ANDROID_TOOLCHAIN)
-    SET(ANDROID_TOOLCHAIN clang)
-ENDIF()
-
-IF(NOT DEFINED ANDROID_ABI)
-    SET(ANDROID_ABI "armeabi-v7a")
-ENDIF()
-
-IF(NOT DEFINED ANDROID_ARM_MODE)
-    SET(ANDROID_ARM_MODE ON)
-ENDIF()
-IF(ANDROID_ARM_MODE)
-    SET(ANDROID_ARM_MODE_NAME "arm")
-ELSE(ANDROID_ARM_MODE)
-    SET(ANDROID_ARM_MODE_NAME "thumb")
-ENDIF(ANDROID_ARM_MODE)
-
-IF(NOT DEFINED ANDROID_ARM_NEON)
-    SET(ANDROID_ARM_NEON ON)
-ENDIF()
-
-IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
-    IF("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
-        SET(CMAKE_SYSTEM_NAME "Linux")
-    ENDIF()
-    MESSAGE(WARNING "It is recommended to use CMake >= 3.7.0 (current version: "
-            "${CMAKE_VERSION}), when cross-compiling for Android.")
-
-    IF(ANDROID_STANDALONE_TOOLCHAIN)
-        # Use standalone toolchain
-        SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
-
-        IF(NOT CMAKE_SYSTEM_VERSION)
-            SET(ANDROID_STANDALONE_TOOLCHAIN_API "")
-            SET(ANDROID_API_LEVEL_H_REGEX "^[\t ]*#[\t ]*define[\t ]+__ANDROID_API__[\t ]+([0-9]+)")
-            FILE(STRINGS "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot/usr/include/android/api-level.h"
-                ANDROID_API_LEVEL_H_CONTENT REGEX "${ANDROID_API_LEVEL_H_REGEX}")
-            IF(ANDROID_API_LEVEL_H_CONTENT MATCHES "${ANDROID_API_LEVEL_H_REGEX}")
-                SET(ANDROID_STANDALONE_TOOLCHAIN_API "${CMAKE_MATCH_1}")
-            ENDIF()
-            SET(CMAKE_SYSTEM_VERSION ${ANDROID_STANDALONE_TOOLCHAIN_API})
-        ENDIF()
-
-        # Toolchain
-        SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
-    ELSE(ANDROID_NDK)
-        # TODO: use android ndk
-    ENDIF()
-
-    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-        SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
-        IF(ANDROID_ABI STREQUAL "armeabi")
-            SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-            SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
-        ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-            SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
-            SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
-        ENDIF()
-    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
-        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
-        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
-        SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
-    ENDIF()
-    SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
-
-    IF(ANDROID_TOOLCHAIN STREQUAL clang)
-        SET(ANDROID_C_COMPILER_NAME clang)
-        SET(ANDROID_CXX_COMPILER_NAME clang++)
-        SET(CMAKE_C_COMPILER_TARGET   ${ANDROID_CLANG_TRIPLE})
-        SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
-    ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
-        SET(ANDROID_C_COMPILER_NAME gcc)
-        SET(ANDROID_CXX_COMPILER_NAME g++)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
-    ENDIF()
-
-    # C compiler
-    IF(NOT CMAKE_C_COMPILER)
-        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
-    ELSE()
-        GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
-    ENDIF()
-    IF(NOT EXISTS ${ANDROID_C_COMPILER})
-        MESSAGE(FATAL_ERROR "Cannot find C compiler: ${ANDROID_C_COMPILER}")
-    ENDIF()
-
-    # CXX compiler
-    IF(NOT CMAKE_CXX_COMPILER)
-        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
-    ELSE()
-        GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
-    ENDIF()
-    IF(NOT EXISTS ${ANDROID_CXX_COMPILER})
-        MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${ANDROID_CXX_COMPILER}")
-    ENDIF()
-
-    SET(CMAKE_C_COMPILER ${ANDROID_C_COMPILER} CACHE PATH "C compiler" FORCE)
-    SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
-
-    # Toolchain and ABI specific flags.
-    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
-    SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
-
-    IF(ANDROID_ABI STREQUAL "armeabi")
-        LIST(APPEND ANDROID_COMPILER_FLAGS
-             -march=armv5te
-             -mtune=xscale
-             -msoft-float)
-    ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-        LIST(APPEND ANDROID_COMPILER_FLAGS
-             -march=armv7-a
-             -mfloat-abi=softfp)
-        IF(ANDROID_ARM_NEON)
-            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=neon)
-        ELSE()
-            LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
-        ENDIF()
-        LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
-    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
-        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
-    ENDIF()
-
-    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-        IF(ANDROID_ARM_MODE)
-            LIST(APPEND ANDROID_COMPILER_FLAGS -marm)
-        ELSE()
-            LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
-        ENDIF()
-        IF(ANDROID_TOOLCHAIN STREQUAL clang)
-            # Disable integrated-as for better compatibility.
-            LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
-        ENDIF()
-    ENDIF()
-
-    IF(ANDROID_TOOLCHAIN STREQUAL clang)
-        # CMake automatically forwards all compiler flags to the linker,
-        # and clang doesn't like having -Wa flags being used for linking.
-        # To prevent CMake from doing this would require meddling with
-        # the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
-        LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
-    ENDIF()
-
-    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
-    STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
-
-    SET(CMAKE_C_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_C_FLAGS}"
-        CACHE STRING "C flags")
-    SET(CMAKE_CXX_FLAGS "${ANDROID_COMPILER_FLAGS} ${CMAKE_CXX_FLAGS}"
-        CACHE STRING "CXX flags")
-    SET(CMAKE_SHARED_LINKER_FLAGS "${ANDROID_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}"
-        CACHE STRING "shared linker flags")
-
-    SET(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
-    SET(CMAKE_EXE_LINKER_FLAGS "-pie -fPIE ${ANDROID_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}"
-        CACHE STRING "executable linker flags")
-
-    MESSAGE(STATUS "Android: Targeting API '${CMAKE_SYSTEM_VERSION}' "
-            "with architecture '${ANDROID_ARM_MODE_NAME}', "
-            "ABI '${ANDROID_ABI}', and processor '${CMAKE_SYSTEM_PROCESSOR}'")
-    MESSAGE(STATUS "System CMAKE_C_FLAGS: " ${CMAKE_C_FLAGS})
-    MESSAGE(STATUS "System CMAKE_CXX_FLAGS: " ${CMAKE_CXX_FLAGS})
-ELSE()
-    IF(ANDROID_STANDALONE_TOOLCHAIN)
-        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
-    ENDIF()
-    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
-        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
-            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
-        ENDIF()
-    ENDIF()
-ENDIF()
diff --git a/cmake/cross_compiling/host.cmake b/cmake/cross_compiling/host.cmake
deleted file mode 100644
index f9c6b12136f..00000000000
--- a/cmake/cross_compiling/host.cmake
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# find host C compiler
-IF(HOST_C_COMPILER)
-    SET(HOST_C_COMPILER_NAME ${HOST_C_COMPILER})
-ELSEIF(NOT $ENV{CC} STREQUAL "")
-    SET(HOST_C_COMPILER_NAME $ENV{CC})
-ELSE()
-    SET(HOST_C_COMPILER_NAME cc)
-ENDIF()
-
-GET_FILENAME_COMPONENT(HOST_C_COMPILER_PATH ${HOST_C_COMPILER_NAME} PROGRAM)
-IF(NOT HOST_C_COMPILER_PATH OR NOT EXISTS ${HOST_C_COMPILER_PATH})
-    MESSAGE(FATAL_ERROR "Cannot find host C compiler, set host C compiler:\n"
-            "\tcmake .. -DHOST_C_COMPILER=...")
-ENDIF()
-
-# find host CXX compiler
-IF(HOST_CXX_COMPILER)
-    SET(HOST_CXX_COMPILER_NAME ${HOST_CXX_COMPILER})
-ELSEIF(NOT $ENV{CXX} STREQUAL "")
-    SET(HOST_CXX_COMPILER_NAME $ENV{CXX})
-ELSE()
-    SET(HOST_CXX_COMPILER_NAME c++)
-ENDIF()
-
-GET_FILENAME_COMPONENT(HOST_CXX_COMPILER_PATH ${HOST_CXX_COMPILER_NAME} PROGRAM)
-IF(NOT HOST_CXX_COMPILER_PATH OR NOT EXISTS ${HOST_CXX_COMPILER_PATH})
-    MESSAGE(FATAL_ERROR "Cannot find host CXX compiler, set host CXX compiler:\n"
-            "\tcmake .. -DHOST_CXX_COMPILER=...")
-ENDIF()
-
-SET(HOST_C_COMPILER ${HOST_C_COMPILER_PATH} CACHE PATH "Host C compiler")
-SET(HOST_CXX_COMPILER ${HOST_CXX_COMPILER_PATH} CACHE PATH "Host CXX compiler")
-
-MESSAGE(STATUS "Found host C compiler: " ${HOST_C_COMPILER})
-MESSAGE(STATUS "Found host CXX compiler: " ${HOST_CXX_COMPILER})
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
deleted file mode 100644
index 10d389ec8ed..00000000000
--- a/cmake/cross_compiling/ios.cmake
+++ /dev/null
@@ -1,347 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This is a toolchain file for cross-compiling for iOS, and the
-# configuration largely refers to public toolchain file:
-#    https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
-# and
-#    https://github.com/cristeab/ios-cmake
-#
-# Supports options:
-# IOS_PLATFORM = OS (default) or SIMULATOR
-#   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
-#   OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
-#   SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
-# IOS_ARCH
-#   The archectures wanted to support, such "arm64", "armv7;arm64"
-# IOS_DEPLOYMENT_TARGET
-#   The minimum iOS deployment version, such as "7.0"
-# IOS_ENABLE_BITCODE = ON (default) or OFF
-# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
-# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
-#   By default this location is automatcially chosen based on the IOS_PLATFORM value above.
-#   If set manually, it will override the default location and force the user of a particular Developer Platform
-# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
-#   By default this location is automatcially chosen based on the IOS_DEVELOPER_ROOT value.
-#   In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
-#   If set manually, this will force the use of a specific SDK version
-
-# Macros:
-# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
-#  A convenience macro for setting xcode specific properties on targets
-#  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
-# find_host_package (PROGRAM ARGS)
-#  A macro used to find executable programs on the host system, not within the iOS environment.
-#  Thanks to the android-cmake project for providing the command
-
-if(NOT IOS)
-  return()
-endif()
-
-set(CMAKE_SYSTEM_NAME Darwin)
-
-# Get the Xcode version being used.
-execute_process(COMMAND xcodebuild -version
-                OUTPUT_VARIABLE XCODE_VERSION
-                RESULT_VARIABLE XCODE_VERSION_RESULT
-                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT ${XCODE_VERSION_RESULT})
-  string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
-  string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
-  message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
-else()
-  message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
-endif()
-
-# Required as of cmake 2.8.10
-set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
-
-# Setup iOS platform unless specified manually with IOS_PLATFORM
-if(NOT DEFINED IOS_PLATFORM)
-  set(IOS_PLATFORM "OS")
-endif()
-set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
-
-# Set the architecture for iOS
-if(NOT DEFINED IOS_ARCH)
-  if(IOS_PLATFORM STREQUAL "OS")
-    set(IOS_ARCH "armv7;armv7s;arm64")
-  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    set(IOS_ARCH "i386;x86_64")
-  endif()
-endif()
-set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string  "Build architecture for iOS")
-
-# Specify minimum iOS deployment version
-if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
-  set(IOS_DEPLOYMENT_TARGET "7.0")
-endif()
-set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
-
-# Whether to enable bitcode
-if(NOT DEFINED IOS_ENABLE_BITCODE)
-  set(IOS_ENABLE_BITCODE ON)
-endif()
-set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
-
-if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
-  set(IOS_USE_VECLIB_FOR_BLAS OFF)
-endif()
-set(IOS_USE_VECLIB_FOR_BLAS ${IOS_UES_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
-
-# Check the platform selection and setup for developer root
-if(${IOS_PLATFORM} STREQUAL "OS")
-  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
-  set(XCODE_IOS_PLATFORM iphoneos)
-
-  # This causes the installers to properly locate the output libraries
-  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
-elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
-  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
-  set(XCODE_IOS_PLATFORM iphonesimulator)
-
-  # This causes the installers to properly locate the output libraries
-  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
-elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
-  set(IOS_PLATFORM_LOCATION "WatchOS.platform")
-  set(XCODE_IOS_PLATFORM watchos)
-
-  # This causes the installers to properly locate the output libraries
-  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
-else(${IOS_PLATFORM} STREQUAL "OS")
-  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
-          "\t OS, SIMULATOR, or WATCHOS.")
-endif()
-
-# Check iOS developer toolchain
-if(NOT DEFINED IOS_DEVELOPER_ROOT)
-  # Setup iOS developer location
-  execute_process(COMMAND xcode-select -print-path
-                  OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
-                  RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
-                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  # Xcode 4.3 changed the installation location, choose the most recent one available
-  if(${XCODE_VERSION} VERSION_LESS "4.3.0")
-    set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
-  else()
-    set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
-  endif()
-endif()
-if(EXISTS ${IOS_DEVELOPER_ROOT})
-  set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
-else()
-  message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
-endif()
-
-# Check iOS SDK
-if(NOT DEFINED IOS_SDK_ROOT)
-  # Find and use the most recent iOS sdk
-  file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
-  if(IOS_SDK_LISTS)
-    list(SORT IOS_SDK_LISTS)
-    list(REVERSE IOS_SDK_LISTS)
-    list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
-  else(IOS_SDK_LISTS)
-    message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
-            " Please manually set IOS_SDK_ROOT or install the iOS SDK.")
-  endif(IOS_SDK_LISTS)
-endif()
-if(EXISTS ${IOS_SDK_ROOT})
-  set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
-  message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
-else()
-  message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
-endif()
-
-# Set the sysroot default to the most recent SDK
-set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
-
-# Get version of iOS SDK
-execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
-                OUTPUT_VARIABLE IOS_SDK_VERSION
-                RESULT_VARIABLE IOS_SDK_VERSION_RESULT
-                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(${IOS_SDK_VERSION_RESULT})
-  string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
-endif()
-if(NOT IOS_SDK_VERSION)
-  message(WARNING "Cannot get SDK's version.")
-  set(IOS_SDK_VERSION 1)
-endif()
-set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
-
-# Find the C & C++ compilers for the specified SDK.
-if(NOT CMAKE_C_COMPILER)
-  # Default to use clang
-  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
-                  OUTPUT_VARIABLE IOS_C_COMPILER
-                  RESULT_VARIABLE IOS_C_COMPILER_RESULT
-                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(${IOS_C_COMPILER_RESULT})
-    get_filename_component(IOS_C_COMPILER clang PROGRAM)
-  endif()
-else(NOT CMAKE_C_COMPILER)
-  # User can set it in cmake command
-  get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
-endif(NOT CMAKE_C_COMPILER)
-if(NOT EXISTS ${IOS_C_COMPILER})
-  message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
-endif()
-
-if(NOT CMAKE_CXX_COMPILER)
-  # Default to use clang++
-  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
-                  OUTPUT_VARIABLE IOS_CXX_COMPILER
-                  RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
-                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(${IOS_CXX_COMPILER_RESULT})
-    get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
-  endif()
-else(NOT CMAKE_CXX_COMPILER)
-  # User can set it in cmake command
-  get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
-endif(NOT CMAKE_CXX_COMPILER)
-if(NOT EXISTS ${IOS_CXX_COMPILER})
-  message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
-endif()
-
-set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
-set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
-
-set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
-set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
-set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
-set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
-
-# Set iOS specific C/C++ flags
-if(IOS_PLATFORM STREQUAL "OS")
-  if(XCODE_VERSION VERSION_LESS "7.0")
-    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
-  else()
-    # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
-    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
-  endif()
-else()
-  set(XCODE_IOS_FLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
-endif()
-
-if(IOS_ENABLE_BITCODE)
-  set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
-else()
-  set(XCODE_IOS_BITCODE_FLAGS "")
-endif()
-
-set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
-
-# Hidden visibilty is required for cxx on iOS 
-set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
-
-set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
-
-if(IOS_USE_VECLIB_FOR_BLAS)
-  # Find vecLib for iOS
-  set(VECLIB_SEARCH_DIRS
-      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
-      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
-      )
-  find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
-
-  include(FindPackageHandleStandardArgs)
-  find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
-
-  if(VECLIB_FOUND)
-    if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
-      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
-      message(STATUS "Found standalone vecLib.framework")
-    else()
-      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
-      message(STATUS "Found vecLib as part of Accelerate.framework")
-    endif()
-
-  endif()
-endif()
-
-set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
-set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
-
-set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
-if(NOT IOS_ENABLE_BITCODE)
-  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
-  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
-else()
-  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
-  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
-endif()
-set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
-set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
-set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
-
-# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
-# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
-# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
-# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
-if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
-  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
-endif()
-
-# Set the find root to the iOS developer roots and to user defined paths
-set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
-    CACHE string  "iOS find search path root")
-
-# default to searching for frameworks first
-set(CMAKE_FIND_FRAMEWORK FIRST)
-
-# set up the default search directories for frameworks
-set(CMAKE_SYSTEM_FRAMEWORK_PATH
-    ${IOS_SDK_ROOT}/System/Library/Frameworks
-    ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
-    ${IOS_SDK_ROOT}/Developer/Library/Frameworks
-    )
-
-# only search the iOS sdks, not the remainder of the host filesystem
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-
-message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
-        "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
-message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
-message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
-
-# Used in ExternalProject command
-string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
-set(EXTERNAL_OPTIONAL_ARGS
-    -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
-    -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
-
-# This little macro lets you set any XCode specific property
-macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
-  set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
-endmacro(set_xcode_property)
-
-# This macro lets you find executable programs on the host system
-macro(find_host_package)
-  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
-  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
-  set(IOS FALSE)
-
-  find_package(${ARGN})
-
-  set(IOS TRUE)
-  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
-  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-endmacro(find_host_package)
diff --git a/cmake/cross_compiling/raspberry_pi.cmake b/cmake/cross_compiling/raspberry_pi.cmake
deleted file mode 100644
index 0425b2ae158..00000000000
--- a/cmake/cross_compiling/raspberry_pi.cmake
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-# http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# This is a toolchain file for cross-compiling for Raspberry Pi.
-# The supported variables are listed belows:
-#
-# RPI_TOOLCHAIN
-# RPI_ARM_NEON
-#
-# Also you can set CMAKE_C/CXX_COMPILER yourself, through cmake arguments.
-
-IF(NOT RPI)
-    return()
-ENDIF()
- 
-SET(CMAKE_SYSTEM_NAME Linux)
-SET(CMAKE_SYSTEM_VERSION 1)
-SET(CMAKE_SYSTEM_PROCESSOR arm)
-
-# check the exist of raspberry pi toolchain
-IF(NOT DEFINED RPI_TOOLCHAIN)
-    SET(RPI_TOOLCHAIN $ENV{RPI_TOOLCHAIN}
-        CACHE PATH "Folder holds the toolchain of Raspberr Pi")
-ENDIF()
-IF(NOT RPI_TOOLCHAIN)
-    MESSAGE(WARNING "It is recommended to set RPI_TOOLCHAIN to use toolchain.\n"
-            "To cross-compile for Raspberry Pi, you need to download the tools using:\n"
-            " git clone https://github.com/raspberrypi/tools\n")
-ENDIF()
-
-IF(NOT DEFINED RPI_ARM_NEON)
-    SET(RPI_ARM_NEON ON)
-ENDIF()
-
-IF(RPI_TOOLCHAIN)
-    SET(RPI_TOOLCHAIN_ROOT ${RPI_TOOLCHAIN})
-    IF(RPI_TOOLCHAIN_ROOT MATCHES "gcc-linaro-arm-linux-gnueabihf-raspbian(-x64)?$")
-        # gcc-linaro-arm-linux-gnueabihf-raspbian
-        # gcc-linaro-arm-linux-gnueabihf-raspbian-x64
-        SET(RPI_TOOLCHAIN_NAME arm-linux-gnueabihf)
-    ENDIF()
-    SET(RPI_TOOLCHAIN_PREFIX "${RPI_TOOLCHAIN_ROOT}/bin/${RPI_TOOLCHAIN_NAME}-")
-ENDIF()
-
-# C compiler
-IF(NOT CMAKE_C_COMPILER)
-    SET(RPI_C_COMPILER "${RPI_TOOLCHAIN_PREFIX}gcc")
-ELSE()
-    GET_FILENAME_COMPONENT(RPI_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
-ENDIF()
-IF(NOT EXISTS ${RPI_C_COMPILER})
-    MESSAGE(FATAL_ERROR "Cannot find C compiler: ${RPI_C_COMPILER}")
-ENDIF()
-
-# CXX compiler
-IF(NOT CMAKE_CXX_COMPILER)
-    SET(RPI_CXX_COMPILER "${RPI_TOOLCHAIN_PREFIX}g++")
-ELSE()
-    GET_FILENAME_COMPONENT(RPI_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
-ENDIF()
-IF(NOT EXISTS ${RPI_CXX_COMPILER})
-    MESSAGE(FATAL_ERROR "Cannot find CXX compiler: ${RPI_CXX_COMPILER}")
-ENDIF()
-
-SET(CMAKE_C_COMPILER ${RPI_C_COMPILER} CACHE PATH "C compiler" FORCE)
-SET(CMAKE_CXX_COMPILER ${RPI_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
-
-IF(RPI_ARM_NEON)
-    SET(RPI_C_FLAGS "${RPI_C_FLAGS} -mfpu=neon")
-ENDIF()
-
-SET(CMAKE_C_FLAGS "${RPI_C_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-SET(CMAKE_CXX_FLAGS "${RPI_C_FLAGS} ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 16432ce2b80..ea46f6418ed 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -63,9 +63,7 @@ function(select_nvcc_arch_flags out_variable)
   # List of arch names
   set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual")
   set(archs_name_default "All")
-  if(NOT CMAKE_CROSSCOMPILING)
-    list(APPEND archs_names "Auto")
-  endif()
+  list(APPEND archs_names "Auto")
 
   # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
   set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 019745aad0d..b347a592929 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -40,38 +40,12 @@ IF(NOT ${CBLAS_FOUND})
     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
     SET(OPENBLAS_COMMIT "v0.2.20")
 
-    IF(CMAKE_CROSSCOMPILING)
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
-        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
-        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
-        IF(ANDROID)
-            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-                # use softfp
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
-            ENDIF()
-        ELSEIF(IOS)
-            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
-                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
-            ELSE()
-                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
-                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
-            ENDIF()
-        ELSEIF(RPI)
-            # use hardfp
-            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
-        ENDIF()
-    ELSE()
-        IF(APPLE)
-            SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
-        ENDIF()
-        SET(OPTIONAL_ARGS "")
-        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
-            SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
-        ENDIF()
+    IF(APPLE)
+        SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+    ENDIF()
+    SET(OPTIONAL_ARGS "")
+    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
     ENDIF()
 
     SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index d0f7f7409ba..e05b7694ddf 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -232,14 +232,6 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
 ENDFUNCTION()
 
 SET(PROTOBUF_VERSION 3.1)
-IF(CMAKE_CROSSCOMPILING)
-    build_protobuf(protobuf_host TRUE)
-    LIST(APPEND external_project_dependencies protobuf_host)
-
-    SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE}
-        CACHE FILEPATH "protobuf executable." FORCE)
-ENDIF()
-
 
 IF(NOT PROTOBUF_FOUND)
     build_protobuf(extern_protobuf FALSE)
@@ -253,11 +245,7 @@ IF(NOT PROTOBUF_FOUND)
     SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
         CACHE FILEPATH "protoc library." FORCE)
 
-    IF(CMAKE_CROSSCOMPILING)
-        PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
-    ELSE()
-        SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
-            CACHE FILEPATH "protobuf executable." FORCE)
-        PROMPT_PROTOBUF_LIB(extern_protobuf)
-    ENDIF()
+    SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
+        CACHE FILEPATH "protobuf executable." FORCE)
+    PROMPT_PROTOBUF_LIB(extern_protobuf)
 ENDIF(NOT PROTOBUF_FOUND)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index c4472040cef..9e6c47f016f 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -156,10 +156,8 @@ set(GPU_COMMON_FLAGS
 endif(NOT WIN32)
 
 if (APPLE)
-    if(NOT CMAKE_CROSSCOMPILING)
-        # On Mac OS X build fat binaries with x86_64 architectures by default.
-        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
-    endif()
+    # On Mac OS X build fat binaries with x86_64 architectures by default.
+    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
     # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
     set (COMMON_FLAGS -Wno-deprecated-register)
 endif(APPLE)
-- 
GitLab


From 3ce10dba15f7d0ac6f3a4e45e59550ac58563eff Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 18:56:16 +0800
Subject: [PATCH 44/73] remove legacy USE_NNPACK option

---
 CMakeLists.txt              |  6 ------
 cmake/external/nnpack.cmake | 30 ------------------------------
 2 files changed, 36 deletions(-)
 delete mode 100644 cmake/external/nnpack.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index de62382b783..37bc1743e2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,7 +60,6 @@ option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
-option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
@@ -283,11 +282,6 @@ if(WITH_MKLDNN)
     list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
 
-if(USE_NNPACK)
-    include(external/nnpack)
-    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
-endif(USE_NNPACK)
-
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake
deleted file mode 100644
index d42bcb0f329..00000000000
--- a/cmake/external/nnpack.cmake
+++ /dev/null
@@ -1,30 +0,0 @@
-# Find the NNPACK library
-#  NNPACK_ROOT - where to find NNPACK include and library.
-#
-
-set(NNPACK_FOUND OFF)
-set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
-find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
-find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
-find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
-find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
-find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
-
-if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
-  set(NNPACK_FOUND ON)
-  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
-
-  set(NNPACK_LIBS)
-  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
-  if (NNPACK_UKERNELS_LIB)
-    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
-  endif()
-  if (NNPACK_CPUFEATURES_LIB)
-    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
-  endif()
-  if(NOT ANDROID)
-    list(APPEND NNPACK_LIBS "rt")
-  endif()
-else()
-  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
-endif()
-- 
GitLab


From cf29ea1592017ef037585fcc89151c159ba64317 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 19:08:26 +0800
Subject: [PATCH 45/73] remove legacy ANDROID option

---
 CMakeLists.txt                        | 23 ---------------
 Dockerfile.android                    | 42 ---------------------------
 cmake/external/glog.cmake             | 10 ++-----
 cmake/external/libxsmm.cmake          |  2 +-
 cmake/generic.cmake                   |  4 +--
 cmake/system.cmake                    | 15 ----------
 paddle/fluid/pybind/CMakeLists.txt    |  4 +--
 paddle/scripts/README.md              |  1 -
 paddle/scripts/paddle_docker_build.sh |  3 --
 9 files changed, 7 insertions(+), 97 deletions(-)
 delete mode 100644 Dockerfile.android

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37bc1743e2e..9ec632e2069 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,29 +91,6 @@ if(NOT CMAKE_BUILD_TYPE)
       FORCE)
 endif()
 
-if(ANDROID OR IOS)
-    if(ANDROID)
-        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
-            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        endif()
-    endif()
-
-    set(WITH_GPU OFF CACHE STRING
-        "Disable GPU when cross-compiling for Android and iOS" FORCE)
-    set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when cross-compiling for Android and iOS" FORCE)
-    set(WITH_PYTHON OFF CACHE STRING
-        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
-    set(WITH_RDMA OFF CACHE STRING
-        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKL OFF CACHE STRING
-        "Disable MKL when cross-compiling for Android and iOS" FORCE)
-    set(WITH_NGRAPH OFF CACHE STRING
-        "Disable nGraph when cross-compiling for Android and iOS" FORCE)
-    set(WITH_GOLANG OFF CACHE STRING
-        "Disable golang when cross-compiling for Android and iOS" FORCE)
-endif()
-
 if (APPLE)
     set(WITH_MKL OFF CACHE STRING
         "Disable MKL for building on mac" FORCE)
diff --git a/Dockerfile.android b/Dockerfile.android
deleted file mode 100644
index 48db2efea21..00000000000
--- a/Dockerfile.android
+++ /dev/null
@@ -1,42 +0,0 @@
-FROM ubuntu:16.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-# ENV variables
-ARG ANDROID_ABI
-ARG ANDROID_API
-
-ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
-ENV ANDROID_API=${ANDROID_API:-21}
-
-ENV HOME=/root \
-    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
-
-RUN apt-get update && \
-    apt-get install -y \
-    git python-dev python-pip python-numpy \
-    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
-    apt-get clean -y
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-RUN pip install --upgrade pip==9.0.3 && \
-    pip install -U 'protobuf==3.1.0' && \
-    pip install -U wheel sphinx && \
-    pip install pre-commit
-
-# Android NDK
-RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
-    mkdir -p /opt/android-ndk-tmp && \
-    cd /opt/android-ndk-tmp && \
-    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
-    unzip -q android-ndk-r14b-linux-x86_64.zip && \
-    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    rm -rf /opt/android-ndk-tmp
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 72a2f601917..7a6a4523886 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -26,14 +26,8 @@ ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 
-IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-  # Using the unofficial glog for Android API < 21
-  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
-  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
-ELSE()
-  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
-  SET(GLOG_TAG "v0.3.5")
-ENDIF()
+SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
+SET(GLOG_TAG "v0.3.5")
 
 ExternalProject_Add(
     extern_glog
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index 530f7ebe281..05fb94a7271 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -19,7 +19,7 @@ IF(NOT WITH_LIBXSMM)
     return()
 ENDIF()
 
-IF(WIN32 OR APPLE OR ANDROID OR IOS)
+IF(WIN32 OR APPLE)
     MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
     SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
     return()
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7dd59577c4e..1f4dbe0b498 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -90,11 +90,11 @@
 # including binary directory for generated headers.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
-if(NOT APPLE AND NOT ANDROID)
+if(NOT APPLE)
   find_package(Threads REQUIRED)
   link_libraries(${CMAKE_THREAD_LIBS_INIT})
   set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
-endif(NOT APPLE AND NOT ANDROID)
+endif(NOT APPLE)
 
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
diff --git a/cmake/system.cmake b/cmake/system.cmake
index c91ef911275..65db05bebe9 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -74,21 +74,6 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 
-# configuration for cross-compiling
-IF(DEFINED CMAKE_SYSTEM_NAME)
-    INCLUDE(cross_compiling/host)
-    IF(${CMAKE_SYSTEM_NAME} STREQUAL "Android")
-        SET(ANDROID TRUE)
-        INCLUDE(cross_compiling/android)
-    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
-        SET(RPI TRUE)
-        INCLUDE(cross_compiling/raspberry_pi)
-    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
-        SET(IOS TRUE)
-        INCLUDE(cross_compiling/ios)
-    ENDIF()
-ENDIF()
-
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 9a91ea38cae..5cc79b9c23f 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -17,9 +17,9 @@ if(WITH_PYTHON)
       SRCS ${PYBIND_SRCS}
       DEPS ${PYBIND_DEPS}
       ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT ANDROID AND NOT WIN32)
+    if(NOT APPLE AND NOT WIN32)
       target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT ANDROID AND NOT WIN32)
+    endif(NOT APPLE AND NOT WIN32)
   endif(WITH_AMD_GPU)
 
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
index dd3242f62ba..6c608fce3cd 100644
--- a/paddle/scripts/README.md
+++ b/paddle/scripts/README.md
@@ -40,7 +40,6 @@ The lastest pre-built build environment images are:
 | Image | Tag |
 | ----- | --- |
 | paddlepaddle/paddle | latest-dev |
-| paddlepaddle/paddle | latest-dev-android |
 
 ### Start Build
 
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 9a098dbbc66..91ca8907c75 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -66,9 +66,6 @@ function main() {
     DOCKER_REPO="paddlepaddle/paddle"
     VERSION="latest-dev"
     PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
-    if [ "$1" == "build_android" ]; then
-        VERSION="latest-dev-android"
-    fi
     IMG=${DOCKER_REPO}:${VERSION}
     start_build_docker $@
 }
-- 
GitLab


From df92d05ef369368f473a80b32aece44073db3986 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 19:13:41 +0800
Subject: [PATCH 46/73] remove legacy IOS option

test=develop
---
 cmake/cblas.cmake            | 7 -------
 cmake/external/libxsmm.cmake | 2 +-
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 74b1ef2122c..52ac31d1d12 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -92,10 +92,3 @@ if(WITH_SYSTEM_BLAS)
     message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   endif()
 endif()
-
-if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER vecLib)
-  set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
-  add_definitions(-DPADDLE_USE_VECLIB)
-endif()
diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake
index 05fb94a7271..39f49d210a2 100644
--- a/cmake/external/libxsmm.cmake
+++ b/cmake/external/libxsmm.cmake
@@ -20,7 +20,7 @@ IF(NOT WITH_LIBXSMM)
 ENDIF()
 
 IF(WIN32 OR APPLE)
-    MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
+    MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.")
     SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
     return()
 ENDIF()
-- 
GitLab


From e6a3a3a31a672994054dae4c9e23a712ad206180 Mon Sep 17 00:00:00 2001
From: peizhilin <wopeizl@163.com>
Date: Mon, 21 Jan 2019 19:47:34 +0800
Subject: [PATCH 47/73] fix pr 15313 test=develop

---
 paddle/fluid/operators/group_norm_op.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index 6e460c470be..3bf8586254e 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -21,20 +21,20 @@ namespace operators {
 
 enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 };
 
-#define CHECK_CASE(i, flags, kernel_name, args...)                   \
-  if (i == flags) {                                                  \
-    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(args); \
+#define CHECK_CASE(i, flags, kernel_name, ...)                              \
+  if (i == flags) {                                                         \
+    kernel_name<T, i><<<grid, threads, 0, dev_ctx.stream()>>>(__VA_ARGS__); \
   }
 
 // 0 for no scale, no bias
 // 1 for has scale, no bias
 // 2 for no scale, has bias
 // 3 for has scale, has bias
-#define UNROLL_ALL_CASES(flags, kernel_name, args...) \
-  CHECK_CASE(0, flags, kernel_name, args)             \
-  CHECK_CASE(1, flags, kernel_name, args)             \
-  CHECK_CASE(2, flags, kernel_name, args)             \
-  CHECK_CASE(3, flags, kernel_name, args)
+#define UNROLL_ALL_CASES(flags, kernel_name, ...) \
+  CHECK_CASE(0, flags, kernel_name, __VA_ARGS__)  \
+  CHECK_CASE(1, flags, kernel_name, __VA_ARGS__)  \
+  CHECK_CASE(2, flags, kernel_name, __VA_ARGS__)  \
+  CHECK_CASE(3, flags, kernel_name, __VA_ARGS__)
 
 template <typename T>
 __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
-- 
GitLab


From 54c0da080dccb843e6a21f0acbcfbf5ffd3a3f86 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Mon, 21 Jan 2019 20:04:18 +0800
Subject: [PATCH 48/73] fix compiler error in paddle_build.sh

test=develop
---
 paddle/scripts/paddle_build.sh | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a06952782b3..cda04451f5e 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -709,10 +709,9 @@ function gen_fluid_lib() {
     Generating fluid library for train and inference ...
     ========================================
 EOF
-        cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
-        make -j `nproc` fluid_lib_dist
-        make -j `nproc` inference_lib_dist
-      fi
+    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
+    make -j `nproc` fluid_lib_dist
+    make -j `nproc` inference_lib_dist
 }
 
 function tar_fluid_lib() {
@@ -721,12 +720,11 @@ function tar_fluid_lib() {
     Taring fluid library for train and inference ...
     ========================================
 EOF
-        cd ${PADDLE_ROOT}/build
-        cp -r fluid_install_dir fluid
-        tar -czf fluid.tgz fluid
-        cp -r fluid_inference_install_dir fluid_inference
-        tar -czf fluid_inference.tgz fluid_inference
-      fi
+    cd ${PADDLE_ROOT}/build
+    cp -r fluid_install_dir fluid
+    tar -czf fluid.tgz fluid
+    cp -r fluid_inference_install_dir fluid_inference
+    tar -czf fluid_inference.tgz fluid_inference
 }
 
 function test_fluid_lib() {
@@ -735,12 +733,11 @@ function test_fluid_lib() {
     Testing fluid library for inference ...
     ========================================
 EOF
-        cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
-        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
-                 ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
-                 ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
-        ./clean.sh
-      fi
+    cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
+    ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
+             ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
+             ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
+    ./clean.sh
 }
 
 function main() {
-- 
GitLab


From d60751fb71490b77bbf5b90faa87fcf4bb01670b Mon Sep 17 00:00:00 2001
From: flame <fuchang1991@gmail.com>
Date: Mon, 21 Jan 2019 20:58:17 +0800
Subject: [PATCH 49/73] add python inference api (#15248)

add python inference api
---
 paddle/fluid/API.spec                         |   1 +
 .../fluid/inference/api/analysis_predictor.h  |   2 +-
 paddle/fluid/pybind/CMakeLists.txt            |   5 +-
 paddle/fluid/pybind/inference_api.cc          | 256 ++++++++++++++++++
 paddle/fluid/pybind/inference_api.h           |  23 ++
 paddle/fluid/pybind/pybind.cc                 |   3 +-
 python/paddle/fluid/compiler.py               |  29 +-
 python/paddle/fluid/executor.py               |   7 +
 .../paddle/fluid/tests/book/test_word2vec.py  |  29 +-
 9 files changed, 346 insertions(+), 9 deletions(-)
 create mode 100644 paddle/fluid/pybind/inference_api.cc
 create mode 100644 paddle/fluid/pybind/inference_api.h

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 0a4edea2c3c..ad39542b4d8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -45,6 +45,7 @@ paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], vararg
 paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e25b5a7047b..9095b6ec1af 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -45,6 +45,7 @@ using contrib::AnalysisConfig;
 class AnalysisPredictor : public PaddlePredictor {
  public:
   explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  ~AnalysisPredictor();
 
   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
             const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
@@ -95,7 +96,6 @@ class AnalysisPredictor : public PaddlePredictor {
   template <typename T>
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);
-  ~AnalysisPredictor();
 
 // Some more detailed tests, they are made the friends of the predictor, so that
 // the all the details can be tested.
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 9a91ea38cae..67b2386813e 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,10 +1,11 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
   feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-  tracer)
+  tracer analysis_predictor)
+
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
 
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
new file mode 100644
index 00000000000..26247026667
--- /dev/null
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -0,0 +1,256 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/inference_api.h"
+#include <pybind11/stl.h>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+using paddle::PaddleDType;
+using paddle::PaddleBuf;
+using paddle::PaddleTensor;
+using paddle::PaddlePlace;
+using paddle::PaddlePredictor;
+using paddle::NativeConfig;
+using paddle::NativePaddlePredictor;
+using paddle::AnalysisPredictor;
+using paddle::contrib::AnalysisConfig;
+
+static void BindPaddleDType(py::module *m);
+static void BindPaddleBuf(py::module *m);
+static void BindPaddleTensor(py::module *m);
+static void BindPaddlePlace(py::module *m);
+static void BindPaddlePredictor(py::module *m);
+static void BindNativeConfig(py::module *m);
+static void BindNativePredictor(py::module *m);
+static void BindAnalysisConfig(py::module *m);
+static void BindAnalysisPredictor(py::module *m);
+
+void BindInferenceApi(py::module *m) {  // Registers all inference bindings.
+  BindPaddleDType(m);
+  BindPaddleBuf(m);
+  BindPaddleTensor(m);
+  BindPaddlePlace(m);
+  BindPaddlePredictor(m);  // registered before the classes that derive from it
+  BindNativeConfig(m);
+  BindNativePredictor(m);
+  BindAnalysisConfig(m);
+  BindAnalysisPredictor(m);
+
+  m->def("create_paddle_predictor",  // overload taking an AnalysisConfig
+         &paddle::CreatePaddlePredictor<AnalysisConfig>);
+  m->def("create_paddle_predictor",  // overload taking a NativeConfig
+         &paddle::CreatePaddlePredictor<NativeConfig>);
+  m->def("paddle_dtype_size", &paddle::PaddleDtypeSize);
+}
+
+void BindPaddleDType(py::module *m) {  // Exposes the PaddleDType enum.
+  py::enum_<PaddleDType>(*m, "PaddleDType")
+      .value("FLOAT32", PaddleDType::FLOAT32)
+      .value("INT64", PaddleDType::INT64);
+}
+
+void BindPaddleBuf(py::module *m) {  // Exposes PaddleBuf, a raw byte buffer.
+  py::class_<PaddleBuf>(*m, "PaddleBuf")
+      .def(py::init<size_t>())  // allocate a buffer of the given byte length
+      .def(py::init([](std::vector<float> &data) {  // copy from float list
+        auto buf = PaddleBuf(data.size() * sizeof(float));
+        std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
+        return buf;  // implicit move; std::move here would inhibit elision
+      }))
+      .def(py::init([](std::vector<int64_t> &data) {  // copy from int64 list
+        auto buf = PaddleBuf(data.size() * sizeof(int64_t));
+        std::memcpy(buf.data(), static_cast<void *>(data.data()), buf.length());
+        return buf;  // implicit move; std::move here would inhibit elision
+      }))
+      .def("resize", &PaddleBuf::Resize)
+      .def("reset",  // replace the contents with a copy of a float list
+           [](PaddleBuf &self, std::vector<float> &data) {
+             self.Resize(data.size() * sizeof(float));
+             std::memcpy(self.data(), data.data(), self.length());
+           })
+      .def("reset",  // replace the contents with a copy of an int64 list
+           [](PaddleBuf &self, std::vector<int64_t> &data) {
+             self.Resize(data.size() * sizeof(int64_t));
+             std::memcpy(self.data(), data.data(), self.length());
+           })
+      .def("empty", &PaddleBuf::empty)
+      .def("float_data",  // copy the bytes out, reinterpreted as float
+           [](PaddleBuf &self) -> std::vector<float> {
+             auto *data = static_cast<float *>(self.data());
+             return {data, data + self.length() / sizeof(*data)};
+           })
+      .def("int64_data",  // copy the bytes out, reinterpreted as int64_t
+           [](PaddleBuf &self) -> std::vector<int64_t> {
+             int64_t *data = static_cast<int64_t *>(self.data());
+             return {data, data + self.length() / sizeof(*data)};
+           })
+      .def("length", &PaddleBuf::length);  // buffer size in bytes
+}
+
+void BindPaddleTensor(py::module *m) {  // PaddleTensor: read/write fields.
+  py::class_<PaddleTensor>(*m, "PaddleTensor")
+      .def(py::init<>())
+      .def_readwrite("name", &PaddleTensor::name)
+      .def_readwrite("shape", &PaddleTensor::shape)
+      .def_readwrite("data", &PaddleTensor::data)
+      .def_readwrite("dtype", &PaddleTensor::dtype)
+      .def_readwrite("lod", &PaddleTensor::lod);
+}
+
+void BindPaddlePlace(py::module *m) {  // Exposes the PaddlePlace enum.
+  py::enum_<PaddlePlace>(*m, "PaddlePlace")
+      .value("UNK", PaddlePlace::kUNK)
+      .value("CPU", PaddlePlace::kCPU)
+      .value("GPU", PaddlePlace::kGPU);
+}
+
+void BindPaddlePredictor(py::module *m) {  // Base predictor class + Config.
+  auto paddle_predictor = py::class_<PaddlePredictor>(*m, "PaddlePredictor");
+  paddle_predictor
+      .def("run",  // returns the outputs instead of filling an out-parameter
+           [](PaddlePredictor &self, const std::vector<PaddleTensor> &inputs) {
+             std::vector<PaddleTensor> outputs;
+             self.Run(inputs, &outputs);  // NOTE(review): Run's result unused
+             return outputs;
+           })
+      .def("get_input_tensor", &PaddlePredictor::GetInputTensor)
+      .def("get_output_tensor", &PaddlePredictor::GetOutputTensor)
+      .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun)
+      .def("clone", &PaddlePredictor::Clone);
+
+  auto config = py::class_<PaddlePredictor::Config>(paddle_predictor, "Config");
+  config.def(py::init<>())  // Config is scoped inside the PaddlePredictor class
+      .def_readwrite("model_dir", &PaddlePredictor::Config::model_dir);
+}
+
+void BindNativeConfig(py::module *m) {  // NativeConfig, derived from Config.
+  py::class_<NativeConfig, PaddlePredictor::Config>(*m, "NativeConfig")
+      .def(py::init<>())
+      .def_readwrite("use_gpu", &NativeConfig::use_gpu)
+      .def_readwrite("device", &NativeConfig::device)
+      .def_readwrite("fraction_of_gpu_memory",
+                     &NativeConfig::fraction_of_gpu_memory)
+      .def_readwrite("prog_file", &NativeConfig::prog_file)
+      .def_readwrite("param_file", &NativeConfig::param_file)
+      .def_readwrite("specify_input_name", &NativeConfig::specify_input_name)
+      .def("set_cpu_math_library_num_threads",
+           &NativeConfig::SetCpuMathLibraryNumThreads)
+      .def("cpu_math_library_num_threads",
+           &NativeConfig::cpu_math_library_num_threads);
+}
+
+void BindNativePredictor(py::module *m) {  // NativePaddlePredictor class.
+  py::class_<NativePaddlePredictor, PaddlePredictor>(*m,
+                                                     "NativePaddlePredictor")
+      .def(py::init<const NativeConfig &>())
+      .def("init", &NativePaddlePredictor::Init)
+      .def("run",  // returns the outputs instead of filling an out-parameter
+           [](NativePaddlePredictor &self,
+              const std::vector<PaddleTensor> &inputs) {
+             std::vector<PaddleTensor> outputs;
+             self.Run(inputs, &outputs);  // NOTE(review): Run's result unused
+             return outputs;
+           })
+      .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor)
+      .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor)
+      .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun)
+      .def("clone", &NativePaddlePredictor::Clone)
+      .def("scope", &NativePaddlePredictor::scope,
+           py::return_value_policy::reference);  // Python does not own it
+}
+
+void BindAnalysisConfig(py::module *m) {  // AnalysisConfig and its options.
+  py::class_<AnalysisConfig>(*m, "AnalysisConfig")
+      .def(py::init<const AnalysisConfig &>())
+      .def(py::init<const std::string &>())
+      .def(py::init<const std::string &, const std::string &>())
+      .def("set_model", (void (AnalysisConfig::*)(const std::string &)) &
+                            AnalysisConfig::SetModel)  // one-string overload
+      .def("set_model", (void (AnalysisConfig::*)(const std::string &,
+                                                  const std::string &)) &
+                            AnalysisConfig::SetModel)  // two-string overload
+      .def("set_prog_file", &AnalysisConfig::SetProgFile)
+      .def("set_params_file", &AnalysisConfig::SetParamsFile)
+      .def("model_dir", &AnalysisConfig::model_dir)
+      .def("prog_file", &AnalysisConfig::prog_file)
+      .def("params_file", &AnalysisConfig::params_file)
+      .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
+           py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
+      .def("disable_gpu", &AnalysisConfig::DisableGpu)
+      .def("use_gpu", &AnalysisConfig::use_gpu)
+      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
+      .def("memory_pool_init_size_mb",
+           &AnalysisConfig::memory_pool_init_size_mb)
+      .def("fraction_of_gpu_memory_for_pool",
+           &AnalysisConfig::fraction_of_gpu_memory_for_pool)
+      .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
+           py::arg("x") = true)
+      .def("ir_optim", &AnalysisConfig::ir_optim)
+      .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
+           py::arg("x") = true)
+      .def("use_feed_fetch_ops_enabled",
+           &AnalysisConfig::use_feed_fetch_ops_enabled)
+      .def("switch_specify_input_names",
+           &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true)
+      .def("specify_input_name", &AnalysisConfig::specify_input_name)
+      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
+           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
+           py::arg("min_subgraph_size") = 3)
+      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
+      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
+           py::arg("x") = true)
+      .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
+      .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
+      .def("set_cpu_math_library_num_threads",
+           &AnalysisConfig::SetCpuMathLibraryNumThreads)
+      .def("cpu_math_library_num_threads",
+           &AnalysisConfig::cpu_math_library_num_threads)
+      .def("to_native_config", &AnalysisConfig::ToNativeConfig)
+      .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
+      .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
+      .def("model_from_memory", &AnalysisConfig::model_from_memory)
+      .def("pass_builder", &AnalysisConfig::pass_builder,
+           py::return_value_policy::reference);  // Python does not own it
+}
+
+void BindAnalysisPredictor(py::module *m) {  // AnalysisPredictor class.
+  py::class_<AnalysisPredictor, PaddlePredictor>(*m, "AnalysisPredictor")
+      .def(py::init<const AnalysisConfig &>())
+      .def("init", &AnalysisPredictor::Init)
+      .def(
+          "run",  // returns the outputs instead of filling an out-parameter
+          [](AnalysisPredictor &self, const std::vector<PaddleTensor> &inputs) {
+            std::vector<PaddleTensor> outputs;
+            self.Run(inputs, &outputs);  // NOTE(review): Run's result unused
+            return outputs;
+          })
+      .def("get_input_tensor", &AnalysisPredictor::GetInputTensor)
+      .def("get_output_tensor", &AnalysisPredictor::GetOutputTensor)
+      .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
+      .def("clone", &AnalysisPredictor::Clone)
+      .def("scope", &AnalysisPredictor::scope,
+           py::return_value_policy::reference);  // Python does not own it
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.h b/paddle/fluid/pybind/inference_api.h
new file mode 100644
index 00000000000..c2adfbecf72
--- /dev/null
+++ b/paddle/fluid/pybind/inference_api.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+namespace paddle {
+namespace pybind {
+void BindInferenceApi(pybind11::module *m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 96d0d16bf78..b086c218988 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -49,6 +49,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
@@ -1083,9 +1084,9 @@ All parameter, weight, gradient are variables in Paddle.
 
   BindRecordIOWriter(&m);
   BindAsyncExecutor(&m);
-
   BindGraph(&m);
   BindNode(&m);
+  BindInferenceApi(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 8bdd03fd50a..a35a4c59835 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -24,6 +24,8 @@ __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
 ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
+InferNativeConfig = core.NativeConfig
+InferAnalysisConfig = core.AnalysisConfig
 
 
 def _place_obj(place):
@@ -70,6 +72,7 @@ class CompiledProgram(object):
         self._executor = None
         self._compiled = False
         self._is_data_parallel = False
+        self._is_inference = False
 
     def with_data_parallel(self,
                            loss_name=None,
@@ -109,10 +112,24 @@ class CompiledProgram(object):
             self._build_strategy = BuildStrategy()
         return self
 
-    def _with_distributed(self):
-        raise NotImplementedError()
+    def with_inference_optimize(self, config):
+        """ Add inference optimize
+
+        Args:
+            config: instance of `NativeConfig` or `AnalysisConfig` to create predictor
+        Returns:
+            self
+        """
+        assert any([
+            isinstance(config, InferNativeConfig),
+            isinstance(config, InferAnalysisConfig)
+        ])
+        self._is_data_parallel = False
+        self._is_inference = True
+        self._infer_config = config
+        return self
 
-    def _with_inference_optimize(self):
+    def _with_distributed(self):
         raise NotImplementedError()
 
     def _compile_data_parallel(self):
@@ -177,6 +194,10 @@ class CompiledProgram(object):
             if self._loss_name else six.u(''), self._scope, self._local_scopes,
             self._exec_strategy, self._build_strategy)
 
+    def _compile_inference(self):
+        assert self._is_data_parallel is False
+        return core.create_paddle_predictor(self._infer_config)
+
     def _compile(self, scope, place):
         """Compile the program based on the configs.
 
@@ -200,6 +221,8 @@ class CompiledProgram(object):
         self._place = place
         if self._is_data_parallel:
             self._executor = self._compile_data_parallel()
+        elif self._is_inference:
+            self._executor = self._compile_inference()
         else:
             p = _place_obj(self._place)
             self._executor = core.Executor(p)
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 0d06d0f2c95..20aa6054fe4 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -27,6 +27,8 @@ from .. import compat as cpt
 __all__ = ['Executor', 'global_scope', 'scope_guard']
 
 g_scope = core.Scope()
+InferNativeConfig = core.NativeConfig
+InferAnalysisConfig = core.AnalysisConfig
 
 
 def global_scope():
@@ -533,6 +535,8 @@ class Executor(object):
                 fetch_list=fetch_list,
                 fetch_var_name=fetch_var_name,
                 return_numpy=return_numpy)
+        elif program._is_inference:
+            return self._run_inference(program, feed)
         else:
             # TODO(panyx0718): Can compile program to optimize executor
             # performance.
@@ -590,3 +594,6 @@ class Executor(object):
         if return_numpy:
             outs = as_numpy(outs)
         return outs
+
+    def _run_inference(self, program, feed):
+        return self.executor.run(feed)
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index e24a9aa989b..48cb7789276 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -195,9 +195,34 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].recursive_sequence_lengths())
+
+        def to_infer_tensor(lod_tensor):
+            infer_tensor = fluid.core.PaddleTensor()
+            infer_tensor.lod = lod_tensor.lod()
+            infer_tensor.data = fluid.core.PaddleBuf(np.array(lod_tensor))
+            infer_tensor.shape = lod_tensor.shape()
+            infer_tensor.dtype = fluid.core.PaddleDType.INT64
+            return infer_tensor
+
+        infer_inputs = [first_word, second_word, third_word, fourth_word]
+        infer_inputs = [to_infer_tensor(t) for t in infer_inputs]
+
+        infer_config = fluid.core.NativeConfig()
+        infer_config.model_dir = 'word2vec.inference.model'
+        infer_config.use_gpu = use_cuda
+        if use_cuda:
+            infer_config.device = 0
+            infer_config.fraction_of_gpu_memory = 0.15
+        compiled_program = fluid.compiler.CompiledProgram(inference_program)
+        compiled_program.with_inference_optimize(infer_config)
+        assert compiled_program._is_inference is True
+        infer_outputs = exe.run(compiled_program, feed=infer_inputs)
         np_data = np.array(results[0])
-        print("Inference Shape: ", np_data.shape)
+        infer_out = infer_outputs[0].data.float_data()
+        for a, b in zip(np_data[0], infer_out):
+            g_a = float("{:.6g}".format(a))
+            g_b = float("{:.6g}".format(b))
+            assert g_a == g_b
 
 
 def main(use_cuda, is_sparse, is_parallel):
-- 
GitLab


From 59e5cc51d63c71c6c9eab0e715fdbdad4e6e5314 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Mon, 21 Jan 2019 21:24:05 +0800
Subject: [PATCH 50/73] Add quantization transform pass and UT.

---
 paddle/fluid/pybind/ir.cc                     |  4 +-
 .../paddle/fluid/contrib/slim/graph/graph.py  | 20 ++++--
 .../contrib/slim/quantization/__init__.py     |  6 +-
 ...tion_performer.py => quantization_pass.py} | 60 +++++++++++-----
 ...performer.py => test_quantization_pass.py} | 70 +++++++++++++++----
 python/paddle/fluid/framework.py              | 40 +++++++++++
 6 files changed, 157 insertions(+), 43 deletions(-)
 rename python/paddle/fluid/contrib/slim/quantization/{quantization_performer.py => quantization_pass.py} (86%)
 rename python/paddle/fluid/contrib/slim/unitest/{test_quantization_performer.py => test_quantization_pass.py} (61%)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 24059140ab2..ba0d4bb4355 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -148,8 +148,8 @@ void BindNode(py::module *m) {
            })
       .def("outputs_append",
            [](Node &self, Node &node) { self.outputs.push_back(&node); })
-      .def_readwrite("inputs", &Node::inputs)
-      .def_readwrite("outputs", &Node::outputs);
+      .def_readonly("inputs", &Node::inputs)
+      .def_readonly("outputs", &Node::outputs);
 
   py::enum_<Node::Type>(node, "Type")
       .value("Operation", Node::Type::kOperation)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
index 61f9f950c4d..80deeee8793 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -26,10 +26,20 @@ class PyGraph(object):
     PyGraph uses core.Graph as the delegation to accomplish the manipulation.
     """
 
-    def __init__(self, graph):
+    def __init__(self, graph, for_test=False):
+        """
+        Construct the PyGraph using core.Graph.
+        Args:
+            graph(core.Graph): C++ Graph.
+            for_test(bool): True for the test graph and false for the train graph.
+        """
         assert isinstance(
             graph, core.Graph), 'graph must be the instance of core.Graph.'
         self.graph = graph
+        self.for_test = for_test
+
+    def is_test(self):
+        return self.for_test
 
     def all_parameters(self):
         param_nodes = set()
@@ -103,7 +113,7 @@ class PyGraph(object):
             remove_nodes = set(remove_nodes)
         core.graph_safe_remove_nodes(self.graph, remove_nodes)
 
-    def draw_graph(self, save_path, name, marked_nodes=None):
+    def draw(self, save_path, name, marked_nodes=None):
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
             exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
@@ -126,6 +136,8 @@ class PyGraph(object):
             if not isinstance(marked_nodes, set):
                 marked_nodes = set(marked_nodes)
             marked_nodes = marked_nodes - remove_ctr_vars
+            if self.graph.has('__graphviz__marked_node__'):
+                self.graph.erase('__graphviz__marked_node__')
             self.graph.set('__graphviz__marked_node__', marked_nodes)
         viz_dot_path = os.path.join(save_path, name) + '.dot'
         viz_pass = core.get_pass('graph_viz_pass')
@@ -137,8 +149,8 @@ class PyGraph(object):
         convert_pass = core.get_pass('graph_to_program_pass')
         convert_pass.set_program('program', Program().desc)
         convert_pass.apply(self.graph)
-        program = Program()
-        program.desc = convert_pass.get_program('program')
+        desc = convert_pass.get_program('program')
+        program = Program.construct_from_desc(desc)
         return program
 
 
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
index f5223854176..6c26475f488 100644
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-from . import quantization_performer
-from .quantization_performer import *
+from . import quantization_pass
+from .quantization_pass import *
 
-__all__ = quantization_performer.__all__
+__all__ = quantization_pass.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
similarity index 86%
rename from python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
rename to python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ac84b763a6c..3c33a513ff3 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_performer.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -15,22 +15,26 @@
 import collections
 import numpy as np
 from .... import core
+from ....framework import Program
+from ....framework import Variable
 from ....initializer import Constant
 from .... import unique_name
 from ..graph import PyGraph
 
-__all__ = ['QuantizationPerformer']
+__all__ = ['QuantizationTransformPass']
 
 
-class QuantizationPerformer(object):
+class QuantizationTransformPass(object):
     def __init__(self,
+                 scope=None,
+                 program_exe=None,
                  weight_bits=8,
                  activation_bits=8,
                  activation_quantize_type='abs_max',
                  weight_quantize_type='abs_max',
                  window_size=10000):
         """
-        Convert and rewrite the IRGraph according to weight and
+        Convert and rewrite the PyGraph according to weight and
         activation quantization type.
         Args:
             weight_bits (int): quantization bit number for weights,
@@ -48,15 +52,21 @@ class QuantizationPerformer(object):
             window_size (int): the window size for 'range_abs_max' quantization.
         Examples:
         .. code-block:: python
-            # the original graph will be rewrite, if you don't want to
-            # change it, please clone at first.
-            # graph = graph.clone()
-            from paddle.fluid.contrib.slim import *
-            from paddle.fluid.contrib.quantize import *
-            graph = IRGraph(program)
-            performer = QuantizationPerformer()
-            performer.quantize_transform(graph)
+            # The original graph will be rewritten.
+            import paddle.fluid as fluid
+            from paddle.fluid.contrib.slim.quantization \
+                import QuantizationTransformPass
+            from paddle.fluid.contrib.slim.graph import PyGraph
+            from paddle.fluid import core
+
+            graph = PyGraph(core.Graph(program.desc), for_test=False)
+            exe = fluid.Executor(fluid.CPUPlace())
+            transform_pass = QuantizationTransformPass(fluid.global_scope(),
+            exe)
+            transform_pass.apply(graph)
         """
+        self.scope = scope
+        self.program_exe = program_exe
         self.weight_bits = weight_bits
         self.activation_bits = activation_bits
 
@@ -74,7 +84,7 @@ class QuantizationPerformer(object):
         self.weight_quantize_type = weight_quantize_type
         self.window_size = window_size
 
-        self.need_inited_outer = collections.OrderedDict()
+        self.need_initialized = collections.OrderedDict()
         self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
         self.quantizable_grad_ops = [
             '%s_grad' % (op) for op in self.quantizable_ops
@@ -86,11 +96,11 @@ class QuantizationPerformer(object):
         self.is_test = None
         self.global_step = None
 
-    def quantize_transform(self, graph, is_test):
-        self.need_inited_outer.clear()
-        self.is_test = is_test
+    def apply(self, graph):
         assert isinstance(graph,
                           PyGraph), 'graph must be the instance of PyGraph.'
+        self.need_initialized.clear()
+        self.is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
         params = [p.name() for p in graph.all_parameters()]
@@ -138,7 +148,19 @@ class QuantizationPerformer(object):
             if op.name() in self.quantizable_grad_ops:
                 _transform_backward(graph, op)
 
-        return self.need_inited_outer
+        if len(self.need_initialized) > 0:
+            assert self.scope is not None, \
+            'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
+            assert self.program_exe is not None, \
+            'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
+            init_program = Program()
+            for var_desc, initializer in self.need_initialized.iteritems():
+                var = Variable.construct_from_desc(init_program.global_block(),
+                                                   var_desc)
+                initializer(var, init_program.global_block())
+            self.program_exe.run(program=init_program, scope=self.scope)
+
+        return graph
 
     def _create_global_step(self, graph):
         if self.weight_quantize_type == 'range_abs_max' or \
@@ -153,7 +175,7 @@ class QuantizationPerformer(object):
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
                     shape=[1],
                     var_dtype=core.VarDesc.VarType.INT64)
-                self.need_inited_outer[global_step_in.var()] = \
+                self.need_initialized[global_step_in.var()] = \
                     Constant(value=0, force_cpu=True)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
@@ -220,7 +242,7 @@ class QuantizationPerformer(object):
             var_type=core.VarDesc.VarType.LOD_TENSOR,
             shape=[1],
             var_dtype=var_node.var().dtype())
-        self.need_inited_outer[scale_in_node.var()] = Constant(value=0.001)
+        self.need_initialized[scale_in_node.var()] = Constant(value=0.001)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         inputs = {'X': var_node, 'InScale': scale_in_node}
@@ -233,7 +255,7 @@ class QuantizationPerformer(object):
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
                 shape=[self.window_size],
                 var_dtype=var_node.var().dtype())
-            self.need_inited_outer[scales_node.var()] = Constant(value=0)
+            self.need_initialized[scales_node.var()] = Constant(value=0)
             inputs['Iter'] = self.global_step
             outputs['OutScales'] = scales_node
         attrs = {
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
similarity index 61%
rename from python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py
rename to python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
index 771d880a28d..31188bedbbe 100644
--- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_performer.py
+++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
@@ -15,11 +15,10 @@
 import unittest
 import random
 import numpy as np
-import paddle
 import paddle.fluid as fluid
 import six
 from paddle.fluid.framework import Program
-from paddle.fluid.contrib.slim.quantization import QuantizationPerformer
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.fluid.contrib.slim.graph import PyGraph
 from paddle.fluid import core
 
@@ -66,22 +65,39 @@ def residual_block(num):
     return loss
 
 
-class TestQuantizationPerformer(unittest.TestCase):
+class TestQuantizationTransformPass(unittest.TestCase):
     def setUp(self):
-        # since quant_op and dequant_op is not ready, use cos and sin for test
-        self.weight_quant_op_type = 'fake_quantize_abs_max'
-        self.dequant_op_type = 'fake_dequantize_max_abs'
         self.quantizable_op_and_inputs = {
             'conv2d': ['Input', 'Filter'],
             'depthwise_conv2d': ['Input', 'Filter'],
             'mul': ['X', 'Y']
         }
-        self.quantizable_op_grad_and_inputs = {
+        self.quantizable_grad_op_inputs = {
             'conv2d_grad': ['Input', 'Filter'],
             'depthwise_conv2d_grad': ['Input', 'Filter'],
             'mul_grad': ['X', 'Y']
         }
 
+    def check_program(self, transform_pass, program):
+        quantized_ops = set()
+        for block in program.blocks:
+            for op in block.ops:
+                # check forward
+                if op.type in self.quantizable_op_and_inputs:
+                    for arg_name in op.input_arg_names:
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        quantized_ops.add(arg_name)
+
+            for op in block.ops:
+                # check backward
+                if op.type in self.quantizable_grad_op_inputs:
+                    for pname in self.quantizable_grad_op_inputs[op.type]:
+                        arg_name = op.input(pname)[0]
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        self.assertTrue(arg_name in quantized_ops)
+
     def linear_fc_quant(self, quant_type):
         main = fluid.Program()
         startup = fluid.Program()
@@ -89,14 +105,26 @@ class TestQuantizationPerformer(unittest.TestCase):
             loss = linear_fc(3)
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
-        graph = PyGraph(core.Graph(main.desc))
-        performer = QuantizationPerformer(activation_quantize_type=quant_type)
-        performer.quantize_transform(graph, False)
+        exe = fluid.Executor(fluid.CPUPlace())
+        graph = PyGraph(core.Graph(main.desc), for_test=False)
+        transform_pass = QuantizationTransformPass(
+            scope=fluid.global_scope(),
+            program_exe=exe,
+            activation_quantize_type=quant_type)
+        transform_pass.apply(graph)
         marked_nodes = set()
         for op in graph.all_ops():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
-        graph.draw_graph('.', 'quantize_fc_' + quant_type, marked_nodes)
+        graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
+        program = graph.to_program()
+        self.check_program(transform_pass, program)
+        val_graph = PyGraph(core.Graph(program.desc), for_test=False)
+        val_marked_nodes = set()
+        for op in val_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                val_marked_nodes.add(op)
+        val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
 
     def test_linear_fc_quant_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
@@ -113,14 +141,26 @@ class TestQuantizationPerformer(unittest.TestCase):
             loss = residual_block(2)
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
-        graph = PyGraph(core.Graph(main.desc))
-        performer = QuantizationPerformer(activation_quantize_type=quant_type)
-        performer.quantize_transform(graph, False)
+        exe = fluid.Executor(fluid.CPUPlace())
+        graph = PyGraph(core.Graph(main.desc), for_test=False)
+        transform_pass = QuantizationTransformPass(
+            scope=fluid.global_scope(),
+            program_exe=exe,
+            activation_quantize_type=quant_type)
+        transform_pass.apply(graph)
         marked_nodes = set()
         for op in graph.all_ops():
             if op.name().find('quantize') > -1:
                 marked_nodes.add(op)
-        graph.draw_graph('.', 'quantize_residual_' + quant_type, marked_nodes)
+        graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
+        program = graph.to_program()
+        self.check_program(transform_pass, program)
+        val_graph = PyGraph(core.Graph(program.desc), for_test=False)
+        val_marked_nodes = set()
+        for op in val_graph.all_ops():
+            if op.name().find('quantize') > -1:
+                val_marked_nodes.add(op)
+        val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
 
     def test_residual_block_abs_max(self):
         self.act_quant_op_type = 'fake_quantize_abs_max'
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 8d061f41f09..3fd625109aa 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -378,6 +378,27 @@ class Variable(object):
             self._ivar.desc = self.desc
             self._ivar.stop_gradient = stop_gradient
 
+    @staticmethod
+    def construct_from_desc(block, desc):
+        """
+        Construct a Variable from variable desc.
+        Args:
+            desc(core.VarDesc): The variable desc for constructing.
+
+        Returns:
+            Variable: A variable.
+        """
+        v = Variable(
+            block=block,
+            type=desc.type(),
+            name=desc.name(),
+            shape=desc.shape(),
+            dtype=desc.dtype(),
+            lod_level=desc.lod_level(),
+            persistable=desc.persistable())
+        v.desc = desc
+        return v
+
     def _numpy(self):
         tensor = self._ivar.value().get_tensor()
         return np.array(tensor)
@@ -1925,6 +1946,25 @@ class Program(object):
         p._sync_with_cpp()
         return p
 
+    @staticmethod
+    def construct_from_desc(desc):
+        """
+        Construct a program from program desc.
+
+        Notes: All information about parameters will be lost.
+
+        Args:
+            desc(core.ProgramDesc): The program desc for constructing.
+
+        Returns:
+            Program: A program.
+        """
+        p = Program()
+        p.desc = desc
+        p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())]
+        p._sync_with_cpp()
+        return p
+
     @property
     def random_seed(self):
         """
-- 
GitLab


From 787c5e714c84d2d2d699fb58e1d420b0fe4d09d6 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Mon, 21 Jan 2019 21:33:56 +0800
Subject: [PATCH 51/73] Update the API.spec. test=develop.

---
 paddle/fluid/API.spec | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 50ffef72baa..3a0c8f888e9 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1,6 +1,7 @@
 paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
+paddle.fluid.Program.construct_from_desc ArgSpec(args=['desc'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-- 
GitLab


From 1888337a49584f356d47561389bde180a07e9586 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 21 Jan 2019 20:51:18 +0800
Subject: [PATCH 52/73] tweak the executor implementation to better match
 original behavior

test=develop
---
 python/paddle/fluid/executor.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 20aa6054fe4..f6bee559eaf 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -305,7 +305,9 @@ class Executor(object):
     def __init__(self, place):
         self.place = place
         self.program_caches = dict()
-        self.executor = None
+        p = core.Place()
+        p.set_place(self.place)
+        self._default_executor = core.Executor(p)
         self._closed = False
 
     def _get_program_cache(self, program_cache_key):
@@ -397,12 +399,13 @@ class Executor(object):
             >>> ...
             >>> exe.close()
         """
-        if not self._closed and self.executor:
-            self.executor.close()
+        if not self._closed:
+            self._default_executor.close()
             self._closed = True
 
     def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name,
                       return_numpy):
+        exe = program._executor
         if isinstance(feed, dict):
             feed_tensor_dict = dict()
             for feed_name in feed:
@@ -414,8 +417,7 @@ class Executor(object):
                     feed_tensor.set(feed[feed_name], core.CPUPlace())
                 feed_tensor_dict[feed_name] = feed_tensor
 
-            self.executor.feed_and_split_tensor_into_local_scopes(
-                feed_tensor_dict)
+            exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
         elif isinstance(feed, list) or isinstance(feed, tuple):
             if len(feed) != len(program._places):
                 raise ValueError(
@@ -436,10 +438,10 @@ class Executor(object):
                         tensor = tmp
                     res_dict[feed_name] = tensor
                 res.append(res_dict)
-            self.executor.feed_tensors_into_local_scopes(res)
+            exe.feed_tensors_into_local_scopes(res)
 
         fetch_var_names = list(map(_to_name_str, fetch_list))
-        self.executor.run(fetch_var_names, fetch_var_name)
+        exe.run(fetch_var_names, fetch_var_name)
         arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
 
         if return_numpy:
@@ -511,12 +513,9 @@ class Executor(object):
         compiled = isinstance(program, compiler.CompiledProgram)
         # For backward compatibility, run directly.
         if not compiled:
-            if not self.executor:
-                p = core.Place()
-                p.set_place(self.place)
-                self.executor = core.Executor(p)
             return self._run(
                 program,
+                self._default_executor,
                 feed=feed,
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
@@ -526,7 +525,6 @@ class Executor(object):
                 use_program_cache=use_program_cache)
 
         program._compile(scope, self.place)
-        self.executor = program._executor
         if program._is_data_parallel:
             return self._run_parallel(
                 program,
@@ -542,6 +540,7 @@ class Executor(object):
             # performance.
             return self._run(
                 program._program,
+                self._default_executor,
                 feed=feed,
                 fetch_list=fetch_list,
                 feed_var_name=feed_var_name,
@@ -550,8 +549,8 @@ class Executor(object):
                 return_numpy=return_numpy,
                 use_program_cache=use_program_cache)
 
-    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
-             scope, return_numpy, use_program_cache):
+    def _run(self, program, exe, feed, fetch_list, feed_var_name,
+             fetch_var_name, scope, return_numpy, use_program_cache):
 
         if feed is None:
             feed = {}
@@ -589,7 +588,7 @@ class Executor(object):
                 fetch_var_name=fetch_var_name)
 
         self._feed_data(program, feed, feed_var_name, scope)
-        self.executor.run(program.desc, scope, 0, True, True)
+        exe.run(program.desc, scope, 0, True, True)
         outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
             outs = as_numpy(outs)
-- 
GitLab


From 1d31a0e10c1ba59e17857b7360cc71bdcd9842d1 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Mon, 21 Jan 2019 22:07:48 +0800
Subject: [PATCH 53/73] resolve conflicts

test=develop
---
 python/paddle/fluid/executor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index f6bee559eaf..d3ff14a1795 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -534,7 +534,7 @@ class Executor(object):
                 fetch_var_name=fetch_var_name,
                 return_numpy=return_numpy)
         elif program._is_inference:
-            return self._run_inference(program, feed)
+            return self._run_inference(program._executor, feed)
         else:
             # TODO(panyx0718): Can compile program to optimize executor
             # performance.
@@ -594,5 +594,5 @@ class Executor(object):
             outs = as_numpy(outs)
         return outs
 
-    def _run_inference(self, program, feed):
-        return self.executor.run(feed)
+    def _run_inference(self, exe, feed):
+        return exe.run(feed)
-- 
GitLab


From 7f8b40f68ddc23c838c4d1faaf0cfa8721c78753 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Tue, 22 Jan 2019 10:04:03 +0800
Subject: [PATCH 54/73] Fix brpc compilation error. (#15451)

---
 paddle/fluid/operators/distributed/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 6a61a8d7861..cb492f99953 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -37,7 +37,7 @@ else()
       variable_response.cc
       collective_client.cc collective_server.cc
       ${BRPC_SRCS}
-    PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto
+    PROTO send_recv.proto
     DEPS lod_tensor selected_rows memory)
 
   set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib)
-- 
GitLab


From a1326cf363599f41ed4ecdf5b69b8815a9e54f2e Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 22 Jan 2019 10:25:50 +0800
Subject: [PATCH 55/73] add NumpyArrayInitializer and use it to refactor nce op

---
 python/paddle/fluid/initializer.py            | 61 ++++++++++++++++++-
 python/paddle/fluid/layers/nn.py              | 27 ++++----
 python/paddle/fluid/layers/tensor.py          | 45 ++++----------
 .../fluid/tests/unittests/test_layers.py      | 12 ----
 4 files changed, 87 insertions(+), 58 deletions(-)

diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 8a2cd4a9290..5e99007031e 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -24,7 +24,8 @@ __all__ = [
     'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
     'MSRA', 'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
     'UniformInitializer', 'NormalInitializer', 'TruncatedNormalInitializer',
-    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer'
+    'XavierInitializer', 'BilinearInitializer', 'MSRAInitializer',
+    'NumpyArrayInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -683,6 +684,64 @@ class BilinearInitializer(Initializer):
         return op
 
 
+class NumpyArrayInitializer(Initializer):
+    """Init an parameter with an numpy array
+
+    Args:
+        value (numpy): numpy array to initialize the variable
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2])))
+    """
+
+    def __init__(self, value):
+        import numpy
+        assert isinstance(value, numpy.ndarray)
+        super(NumpyArrayInitializer, self).__init__()
+        self._value = value
+
+    def __call__(self, var, block):
+        """Add constant initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        dtype = framework.convert_np_dtype_to_dtype_(self._value.dtype)
+        if dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in self._value.flat]
+        elif dtype == VarDesc.VarType.INT32:
+            value_name = "int32_values"
+            values = [int(v) for v in self._value.flat]
+        else:
+            raise ValueError("Unsupported dtype %s", self._value.dtype)
+        if self._value.size > 1024 * 1024 * 5:
+            raise ValueError("The size of input is too big. Please consider "
+                             "saving it to file and 'load_op' to load it")
+        op = block._prepend_op(
+            type='assign_value',
+            outputs={'Out': var},
+            attrs={
+                'dtype': dtype,
+                'shape': list(input.shape),
+                value_name: values
+            },
+            stop_gradient=True)
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index deadb162214..709d2c07c62 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -22,7 +22,7 @@ import six
 import os
 import inspect
 from ..layer_helper import LayerHelper
-from ..initializer import Normal, Constant
+from ..initializer import Normal, Constant, NumpyArrayInitializer
 from ..framework import Variable, OpProtoHolder
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -5181,16 +5181,21 @@ def nce(input,
             alias_probs_[little[0]] = 1.0
             alias_[little[0]] = -1
 
-        probs = assign(
-            input=np.array(custom_dist).astype('float32'), init_once=True)
-        custom_alias = assign(
-            input=np.array(alias_).astype('int32'), init_once=True)
-        custom_alias_probs = assign(
-            input=np.array(alias_probs_).astype('float32'), init_once=True)
-
-        inputs['CustomDistProbs'] = probs
-        inputs['CustomDistAlias'] = custom_alias
-        inputs['CustomDistAliasProbs'] = custom_alias_probs
+        def _init_by_numpy_array(numpy_array):
+            ret = helper.create_parameter(
+                attr=ParamAttr(),
+                shape=numpy_array.shape,
+                dtype=numpy_array.dtype,
+                default_initializer=NumpyArrayInitializer(numpy_array))
+            ret.stop_gradient = True
+            return ret
+
+        inputs['CustomDistProbs'] = _init_by_numpy_array(
+            np.array(custom_dist).astype('float32'))
+        inputs['CustomDistAlias'] = _init_by_numpy_array(
+            np.array(alias_).astype('int32'))
+        inputs['CustomDistAliasProbs'] = _init_by_numpy_array(
+            np.array(alias_probs_).astype('float32'))
         sampler = 2
     else:
         raise Exception("Unsupported sampler type.")
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index bd2a7294694..ce9f508c9f1 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -291,7 +291,7 @@ def sums(input, out=None):
     return out
 
 
-def assign(input, output=None, init_once=False):
+def assign(input, output=None):
     """
     **Assign**
 
@@ -300,7 +300,6 @@ def assign(input, output=None, init_once=False):
     Args:
         input(Variable|numpy.ndarray): The source variable
         output(Variable|None): The destination variable
-        init_once(bool|false): assign value into global var only in startup program.
 
     Returns:
         Variable: The destination variable that was supplied as the *output*.
@@ -314,22 +313,10 @@ def assign(input, output=None, init_once=False):
     """
     helper = LayerHelper('assign', **locals())
     if output is None:
-        if init_once:
-            output = helper.create_parameter(
-                attr=ParamAttr(),
-                shape=input.shape,
-                dtype=input.dtype,
-                default_initializer=Constant(0.0))
-            output.stop_gradient = True
-        else:
-            output = helper.create_variable_for_type_inference(
-                dtype=input.dtype)
+        output = helper.create_variable_for_type_inference(dtype=input.dtype)
     if isinstance(input, Variable):
-        if init_once:
-            raise ValueError("init once only support numpy assign!")
         helper.append_op(
             type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
-
     elif isinstance(input, numpy.ndarray):
         dtype = convert_np_dtype_to_dtype_(input.dtype)
         if dtype == VarDesc.VarType.FP32:
@@ -340,28 +327,18 @@ def assign(input, output=None, init_once=False):
             values = [int(v) for v in input.flat]
         else:
             raise ValueError("Unsupported dtype %s", input.dtype)
-        if input.size > 1024 * 1024 * 5:
+        if input.size > 1024 * 1024:
             raise ValueError("The size of input is too big. Please consider "
                              "saving it to file and 'load_op' to load it")
 
-        if init_once:
-            helper.startup_program.global_block().append_op(
-                type='assign_value',
-                outputs={'Out': [output]},
-                attrs={
-                    'dtype': dtype,
-                    'shape': list(input.shape),
-                    value_name: values
-                })
-        else:
-            helper.append_op(
-                type='assign_value',
-                outputs={'Out': [output]},
-                attrs={
-                    'dtype': dtype,
-                    'shape': list(input.shape),
-                    value_name: values
-                })
+        helper.append_op(
+            type='assign_value',
+            outputs={'Out': [output]},
+            attrs={
+                'dtype': dtype,
+                'shape': list(input.shape),
+                value_name: values
+            })
     else:
         raise ValueError("Wrong type for assign input: %s" % type(input))
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 2e2f9a5583b..90f5d797a67 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1023,18 +1023,6 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
-    def test_assign(self):
-        import numpy as np
-        startup = Program()
-        main = Program()
-        with program_guard(main, startup):
-            probs = layers.assign(
-                input=np.random.random([1, 2]).astype('float32'),
-                init_once=True)
-
-        print(str(main))
-        print(str(startup))
-
 
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From 0aecf7c70e52e99bf7decda820f18039b3f373e6 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 22 Jan 2019 10:46:48 +0800
Subject: [PATCH 56/73] add TestNumpyArrayInitializer

---
 .../fluid/tests/unittests/test_initializer.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index ab7183f88df..2e70175d439 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -420,5 +420,25 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.type, 'assign_value')
 
 
+class TestNumpyArrayInitializer(unittest.TestCase):
+    def test_numpy_array_initializer(self):
+        """Test the numpy array initializer with supplied arguments
+        """
+        import numpy
+        program = framework.Program()
+        block = program.global_block()
+        for _ in range(2):
+            np_array = numpy.array([1, 2, 3, 4]).astype('float32')
+            block.create_parameter(
+                dtype=np_array.dtype,
+                shape=np_array.shape,
+                lod_level=0,
+                name="param",
+                initializer=initializer.NumpyArrayInitializer(np_array))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From ec213730bcb3ca627c59c1a45b82afa4a79aed45 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Tue, 22 Jan 2019 05:00:26 +0000
Subject: [PATCH 57/73] fix trt stream bug. BUG: After continuing to input
 different data, the output cannot be aligned test=develop

---
 .../tensorrt/convert/test_op_converter.cc     |  4 +-
 .../inference/tensorrt/convert/ut_helper.h    | 10 ++--
 paddle/fluid/inference/tensorrt/engine.cc     | 16 +++---
 paddle/fluid/inference/tensorrt/engine.h      | 50 +++----------------
 .../fluid/inference/tensorrt/test_engine.cc   |  4 +-
 .../operators/tensorrt/tensorrt_engine_op.h   |  9 +++-
 .../tensorrt/tensorrt_engine_op_test.cc       |  4 +-
 7 files changed, 31 insertions(+), 66 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 01d7f700da9..c5a413221eb 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) {
   // init trt engine
   cudaStream_t stream_;
   std::unique_ptr<TensorRTEngine> engine_;
-  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
-  engine_->InitNetwork();
   PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+  engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
+  engine_->InitNetwork();
 
   engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
                         nvinfer1::Dims3(2, 5, 5));
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index f313beb73bb..e83961f3d7b 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -78,11 +78,9 @@ class TRTConvertValidation {
         scope_(scope),
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
-    // create engine.
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
-    engine_->InitNetwork();
-
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
+    engine_->InitNetwork();
   }
 
   // Declare a Variable as input with random initialization.
@@ -175,7 +173,7 @@ class TRTConvertValidation {
     op_->Run(scope_, place);
     // Execute TRT.
     engine_->Execute(batch_size);
-    cudaStreamSynchronize(*engine_->stream());
+    cudaStreamSynchronize(engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
     const size_t output_space_size = 3000;
@@ -184,7 +182,7 @@ class TRTConvertValidation {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
       engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-      cudaStreamSynchronize(*engine_->stream());
+      cudaStreamSynchronize(engine_->stream());
 
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index f739752cbc4..78b590f15d6 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) {
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
-  PADDLE_ENFORCE_NOT_NULL(stream_);
-  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
-  cudaStreamSynchronize(*stream_);
+  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
+  cudaStreamSynchronize(stream_);
   SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(*stream_);
+  cudaStreamSynchronize(stream_);
   // clean buffer
   for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
@@ -173,7 +172,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, *stream_),
+                                    cudaMemcpyDeviceToDevice, stream_),
                     0);
 }
 
@@ -194,7 +193,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, *stream_));
+                                       cudaMemcpyDeviceToHost, stream_));
 }
 
 Buffer &TensorRTEngine::buffer(const std::string &name) {
@@ -211,12 +210,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, *stream_));
+                                       cudaMemcpyHostToDevice, stream_));
 }
 
 void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
@@ -227,7 +225,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, *stream_));
+                                       cudaMemcpyDeviceToDevice, stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string &name,
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index f5b2c28ba9e..65ab7f3caaa 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -54,17 +54,14 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace,
-                 cudaStream_t* stream = nullptr, int device = 0,
+  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
+                 int device = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream ? stream : &default_stream_),
+        stream_(stream),
         logger_(logger),
-        device_(device) {
-    freshDeviceId();
-    cudaStreamCreate(stream_);
-  }
+        device_(device) {}
 
   virtual ~TensorRTEngine();
 
@@ -102,7 +99,7 @@ class TensorRTEngine : public EngineBase {
   // NOTE this should be used after calling `FreezeNetwork`.
   Buffer& buffer(const std::string& name) override;
 
-  cudaStream_t* stream() { return stream_; }
+  cudaStream_t stream() { return stream_; }
 
   // Fill an input from CPU memory with name and size.
   void SetInputFromCPU(const std::string& name, const void* data, size_t size);
@@ -158,9 +155,8 @@ class TensorRTEngine : public EngineBase {
 
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
-  cudaStream_t* stream_;
-  // If stream_ is not set from outside, hold its own stream.
-  cudaStream_t default_stream_;
+  cudaStream_t stream_;
+
   nvinfer1::ILogger& logger_;
 
   std::vector<Buffer> buffers_;
@@ -208,38 +204,6 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
-/*
- * Helper to control the TensorRT engine's creation and deletion.
- */
-class TRT_EngineManager {
- public:
-  bool HasEngine(const std::string& name) const {
-    return engines_.count(name) != 0;
-  }
-
-  // Get an engine called `name`.
-  TensorRTEngine* Get(const std::string& name) const {
-    return engines_.at(name).get();
-  }
-
-  // Create or get an engine called `name`
-  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
-                         const std::string& name, int gpu_device = 0) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device);
-    engines_[name].reset(p);
-    return p;
-  }
-
-  void DeleteALl() {
-    for (auto& item : engines_) {
-      item.second.reset(nullptr);
-    }
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
-};
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index da1f6535cb3..9eed0f6ee9c 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,8 +27,8 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    // ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
+    ASSERT_EQ(0, cudaStreamCreate(&stream_));
+    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
     engine_->InitNetwork();
   }
 
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 88c4f508474..e7e990f759b 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -96,9 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase {
   void RunTrt(const framework::Scope &scope,
               const platform::Place &dev_place) const {
     int runtime_batch = 1;
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
     if (trt_engine_.get() == nullptr) {
       trt_engine_.reset(new TensorRTEngine(
-          max_batch_size_, workspace_size_, nullptr,
+          max_batch_size_, workspace_size_, stream,
           boost::get<platform::CUDAPlace>(dev_place).device));
       Prepare(scope, dev_place, trt_engine_.get());
     }
@@ -126,6 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       }
     }
 
+    cudaStreamSynchronize(stream);
     PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
     engine->Execute(runtime_batch);
@@ -163,7 +168,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       output_index += 1;
     }
 
-    cudaStreamSynchronize(*engine->stream());
+    cudaStreamSynchronize(stream);
   }
 
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 287b0edc96e..bb25a37584e 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 20);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 20);
   SetAttr<std::vector<std::string>>(
       engine_op_desc.Proto(), "parameters",
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
-- 
GitLab


From 99d533d026188925186d1ab188130f73897dca70 Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 22 Jan 2019 13:36:30 +0800
Subject: [PATCH 58/73] update TestNumpyArrayInitializer test=develop

---
 python/paddle/fluid/initializer.py                      | 2 +-
 python/paddle/fluid/tests/unittests/test_initializer.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 5e99007031e..4f434328e47 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -734,7 +734,7 @@ class NumpyArrayInitializer(Initializer):
             outputs={'Out': var},
             attrs={
                 'dtype': dtype,
-                'shape': list(input.shape),
+                'shape': list(self._value.shape),
                 value_name: values
             },
             stop_gradient=True)
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 2e70175d439..2d98b063d10 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -427,8 +427,8 @@ class TestNumpyArrayInitializer(unittest.TestCase):
         import numpy
         program = framework.Program()
         block = program.global_block()
+        np_array = numpy.random.random((10000)).astype("float32")
         for _ in range(2):
-            np_array = numpy.array([1, 2, 3, 4]).astype('float32')
             block.create_parameter(
                 dtype=np_array.dtype,
                 shape=np_array.shape,
@@ -438,6 +438,7 @@ class TestNumpyArrayInitializer(unittest.TestCase):
         self.assertEqual(len(block.ops), 1)
         init_op = block.ops[0]
         self.assertEqual(init_op.type, 'assign_value')
+        assert (init_op.attr('fp32_values') == np_array).all()
 
 
 if __name__ == '__main__':
-- 
GitLab


From a71f7ed787766cc2bce9d27ea471acf4f64ab93e Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Tue, 22 Jan 2019 14:09:06 +0800
Subject: [PATCH 59/73] update API.spec test=develop

---
 paddle/fluid/API.spec | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index ad39542b4d8..2e7e200484a 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -67,6 +67,7 @@ paddle.fluid.initializer.BilinearInitializer.__init__ ArgSpec(args=['self'], var
 paddle.fluid.initializer.MSRAInitializer.__init__ ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0))
 paddle.fluid.initializer.force_init_on_cpu ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
 paddle.fluid.initializer.init_on_cpu ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
 paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
 paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
-- 
GitLab


From b913463e83faf48a95a6db6f51357bb1af2066d4 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Tue, 22 Jan 2019 14:52:12 +0800
Subject: [PATCH 60/73] Update according to the reviewers' suggestion.
 test=develop

---
 paddle/fluid/pybind/ir.cc                     |   4 +-
 paddle/fluid/pybind/pybind.cc                 |   8 +-
 .../paddle/fluid/contrib/slim/graph/graph.py  | 135 +-------------
 .../slim/quantization/quantization_pass.py    | 168 +++++++----------
 .../slim/unitest/test_quantization_pass.py    |  10 +-
 python/paddle/fluid/framework.py              | 174 +++++++++++++++---
 6 files changed, 228 insertions(+), 271 deletions(-)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index ba0d4bb4355..24059140ab2 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -148,8 +148,8 @@ void BindNode(py::module *m) {
            })
       .def("outputs_append",
            [](Node &self, Node &node) { self.outputs.push_back(&node); })
-      .def_readonly("inputs", &Node::inputs)
-      .def_readonly("outputs", &Node::outputs);
+      .def_readwrite("inputs", &Node::inputs)
+      .def_readwrite("outputs", &Node::outputs);
 
   py::enum_<Node::Type>(node, "Type")
       .value("Operation", Node::Type::kOperation)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c55e8b04759..c4704837566 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -797,18 +797,18 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
       .def("has", &ir::Pass::Has)
-      .def("set_program",
+      .def("set",
            [](ir::Pass &self, const std::string &attr_name,
               const ProgramDesc &attr) {
              return self.Set(attr_name, new ProgramDesc(attr));
            })
       .def(
-          "set_str",
+          "set",
           [](ir::Pass &self, const std::string &name, const std::string &attr) {
             self.Set<std::string>(name, new std::string(attr));
           })
-      .def("set_int", [](ir::Pass &self, const std::string &name,
-                         int val) { self.Set<const int>(name, new int(val)); })
+      .def("set", [](ir::Pass &self, const std::string &name,
+                     int val) { self.Set<const int>(name, new int(val)); })
       .def("get_program", &ir::Pass::Get<ProgramDesc>)
       .def("type", &ir::Pass::Type)
       .def("apply", [](ir::Pass &self, std::shared_ptr<ir::Graph> graph) {
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
index 80deeee8793..f38d9783413 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph.py
@@ -18,140 +18,7 @@ from ....framework import Program
 from ....framework import Block
 from .... import core
 
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph', 'PyGraph']
-
-
-class PyGraph(object):
-    """
-    PyGraph uses core.Graph as the delegation to accomplish the manipulation.
-    """
-
-    def __init__(self, graph, for_test=False):
-        """
-        Construct the PyGraph using core.Graph.
-        Args:
-            graph(core.Graph): C++ Graph.
-            for_test(bool): True for the test graph and false for the train graph.
-        """
-        assert isinstance(
-            graph, core.Graph), 'graph must be the instance of core.Graph.'
-        self.graph = graph
-        self.for_test = for_test
-
-    def is_test(self):
-        return self.for_test
-
-    def all_parameters(self):
-        param_nodes = set()
-        for node in self.graph.nodes():
-            if node.is_var() and node.var() is not None and node.var(
-            ).persistable():
-                param_nodes.add(node)
-        return param_nodes
-
-    def all_vars(self):
-        return {node for node in self.graph.nodes() if node.is_var()}
-
-    def all_ops(self):
-        return {node for node in self.graph.nodes() if node.is_op()}
-
-    def create_param_node(self, name, var_type, shape, var_dtype):
-        var_desc = core.VarDesc(name)
-        var_desc.set_type(var_type)
-        var_desc.set_shape(shape)
-        var_desc.set_dtype(var_dtype)
-        var_desc.set_persistable(True)
-        return self.graph.create_var_node(var_desc)
-
-    def create_var_node(self, name, var_type, shape, var_dtype):
-        var_desc = core.VarDesc(name)
-        var_desc.set_type(var_type)
-        var_desc.set_shape(shape)
-        var_desc.set_dtype(var_dtype)
-        return self.graph.create_var_node(var_desc)
-
-    def create_var_node_from_desc(self, var_desc):
-        return self.graph.create_var_node(var_desc)
-
-    def create_op_node(self, op_type, attrs, inputs, outputs):
-        op_desc = core.OpDesc()
-        op_desc.set_type(op_type)
-        for attr, value in attrs.iteritems():
-            self._update_desc_attr(op_desc, attr, value)
-        for input_name, var_nodes in inputs.iteritems():
-            if not isinstance(var_nodes, list):
-                var_nodes = [var_nodes]
-            op_desc.set_input(input_name,
-                              [var_node.name() for var_node in var_nodes])
-        for output_name, var_nodes in outputs.iteritems():
-            if not isinstance(var_nodes, list):
-                var_nodes = [var_nodes]
-            op_desc.set_output(output_name,
-                               [var_node.name() for var_node in var_nodes])
-        return self.graph.create_op_node(op_desc)
-
-    def create_op_node_from_desc(self, op_desc):
-        return self.graph.create_op_node(op_desc)
-
-    def _update_desc_attr(self, desc, name, val):
-        """
-        Update the value of desc's attribute by attribute's name.
-        """
-        if isinstance(val, Block):
-            desc.set_block_attr(name, val.desc)
-        elif isinstance(val, list) and val and all(
-                isinstance(v, Block) for v in val):
-            desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
-            desc.set_serialized_attr(name, val.serialize_to_string())
-        else:
-            desc._set_attr(name, val)
-
-    def safe_remove_nodes(self, remove_nodes):
-        if not isinstance(remove_nodes, set):
-            remove_nodes = set(remove_nodes)
-        core.graph_safe_remove_nodes(self.graph, remove_nodes)
-
-    def draw(self, save_path, name, marked_nodes=None):
-        def _convert_to_pdf(dot_file_path):
-            pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
-            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
-                            + ' -o ' + pdf_save_path, shell=True)
-            if exited_code != 0:
-                print('The dot command is needed for creating pdf files.')
-                print('The {} is saved as the dot filetype.'.format(
-                    dot_file_path))
-
-        remove_ctr_vars = set()
-        ops_num = 0
-        for node in self.graph.nodes():
-            if node.is_ctrl_var():
-                remove_ctr_vars.add(node)
-            elif node.is_op():
-                ops_num += 1
-        print('Total ops num = {}.'.format(ops_num))
-        self.safe_remove_nodes(remove_ctr_vars)
-        if marked_nodes is not None:
-            if not isinstance(marked_nodes, set):
-                marked_nodes = set(marked_nodes)
-            marked_nodes = marked_nodes - remove_ctr_vars
-            if self.graph.has('__graphviz__marked_node__'):
-                self.graph.erase('__graphviz__marked_node__')
-            self.graph.set('__graphviz__marked_node__', marked_nodes)
-        viz_dot_path = os.path.join(save_path, name) + '.dot'
-        viz_pass = core.get_pass('graph_viz_pass')
-        viz_pass.set_str('graph_viz_path', viz_dot_path)
-        viz_pass.apply(self.graph)
-        _convert_to_pdf(viz_dot_path)
-
-    def to_program(self):
-        convert_pass = core.get_pass('graph_to_program_pass')
-        convert_pass.set_program('program', Program().desc)
-        convert_pass.apply(self.graph)
-        desc = convert_pass.get_program('program')
-        program = Program.construct_from_desc(desc)
-        return program
+__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
 
 
 class Graph(object):
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 3c33a513ff3..ce16a32415e 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -13,13 +13,12 @@
 # limitations under the License.
 
 import collections
-import numpy as np
 from .... import core
+from ....framework import IrGraph
 from ....framework import Program
 from ....framework import Variable
 from ....initializer import Constant
 from .... import unique_name
-from ..graph import PyGraph
 
 __all__ = ['QuantizationTransformPass']
 
@@ -34,7 +33,7 @@ class QuantizationTransformPass(object):
                  weight_quantize_type='abs_max',
                  window_size=10000):
         """
-        Convert and rewrite the PyGraph according to weight and
+        Convert and rewrite the IrGraph according to weight and
         activation quantization type.
         Args:
             weight_bits (int): quantization bit number for weights,
@@ -56,19 +55,19 @@ class QuantizationTransformPass(object):
             import paddle.fluid as fluid
             from paddle.fluid.contrib.slim.quantization \
                 import QuantizationTransformPass
-            from paddle.fluid.contrib.slim.graph import PyGraph
+            from paddle.fluid.framework import IrGraph
             from paddle.fluid import core
 
-            graph = PyGraph(core.Graph(program.desc), for_test=False)
+            graph = IrGraph(core.Graph(program.desc), for_test=False)
             exe = fluid.Executor(fluid.CPUPlace())
             transform_pass = QuantizationTransformPass(fluid.global_scope(),
             exe)
             transform_pass.apply(graph)
         """
-        self.scope = scope
-        self.program_exe = program_exe
-        self.weight_bits = weight_bits
-        self.activation_bits = activation_bits
+        self._scope = scope
+        self._program_exe = program_exe
+        self._weight_bits = weight_bits
+        self._activation_bits = activation_bits
 
         quant_type = ['abs_max', 'range_abs_max']
         if activation_quantize_type not in quant_type:
@@ -80,27 +79,27 @@ class QuantizationTransformPass(object):
                 "Unknown weight_quantize_type: '%s'. It can only be ",
                 "'abs_max' or 'range_abs_max'.", str(weight_quantize_type))
 
-        self.activation_quantize_type = activation_quantize_type
-        self.weight_quantize_type = weight_quantize_type
-        self.window_size = window_size
+        self._activation_quantize_type = activation_quantize_type
+        self._weight_quantize_type = weight_quantize_type
+        self._window_size = window_size
 
-        self.need_initialized = collections.OrderedDict()
-        self.quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
-        self.quantizable_grad_ops = [
-            '%s_grad' % (op) for op in self.quantizable_ops
+        self._need_initialized = collections.OrderedDict()
+        self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._quantizable_grad_ops = [
+            '%s_grad' % (op) for op in self._quantizable_ops
         ]
-        self.fake_quant_op_types = [
+        self._fake_quant_op_types = [
             'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
         ]
-        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
-        self.is_test = None
-        self.global_step = None
+        self._fake_dequant_op_types = ['fake_dequantize_max_abs']
+        self._is_test = None
+        self._global_step = None
 
     def apply(self, graph):
         assert isinstance(graph,
-                          PyGraph), 'graph must be the instance of PyGraph.'
-        self.need_initialized.clear()
-        self.is_test = graph.is_test()
+                          IrGraph), 'graph must be the instance of IrGraph.'
+        self._need_initialized.clear()
+        self._is_test = graph.is_test()
         # marked the variable which has been dequantized.
         dequantized_vars = collections.OrderedDict()
         params = [p.name() for p in graph.all_parameters()]
@@ -110,72 +109,69 @@ class QuantizationTransformPass(object):
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
                 else:
-                    quant_bits = self.weight_bits if var_node.name() in params \
-                    else self.activation_bits
-                    quant_type = self.weight_quantize_type if var_node.name() \
-                        in params else self.activation_quantize_type
+                    quant_bits = self._weight_bits if var_node.name() in params \
+                    else self._activation_bits
+                    quant_type = self._weight_quantize_type if var_node.name() \
+                        in params else self._activation_quantize_type
                     quant_var_node, scale_var_node = self._insert_quant_op(
                         graph, var_node, quant_bits, quant_type)
                     dequant_var_node = self._insert_dequant_op(
                         graph, quant_var_node, scale_var_node, quant_bits)
                     dequantized_vars[var_node.name()] = dequant_var_node
-                self._update_input(var_node, dequant_var_node, op)
-                op.op()._rename_input(var_node.name(), dequant_var_node.name())
+                graph.update_input_link(var_node, dequant_var_node, op)
 
         def _transform_backward(graph, op):
             no_dequanted_input_vars = True
             for var_node in op.inputs:
                 if var_node.name() in dequantized_vars:
                     dequant_var_node = dequantized_vars[var_node.name()]
-                    self._update_input(var_node, dequant_var_node, op)
-                    op.op()._rename_input(var_node.name(),
-                                          dequant_var_node.name())
+                    graph.update_input_link(var_node, dequant_var_node, op)
                     no_dequanted_input_vars = False
             if no_dequanted_input_vars:
                 raise ValueError("There is no dequanted inputs for op %s." %
                                  (op.name()))
 
-        if not self.is_test:
+        if not self._is_test:
             self._create_global_step(graph)
         ops = graph.all_ops()
         # The process of _transform_forward and _transform_backward is needed in two for loops.
         # The loop for transforming the forward graph:
         for op in ops:
-            if op.name() in self.quantizable_ops:
+            if op.name() in self._quantizable_ops:
                 _transform_forward(graph, op)
         # The loop for renaming the inputs of backward op.
         for op in ops:
-            if op.name() in self.quantizable_grad_ops:
+            if op.name() in self._quantizable_grad_ops:
                 _transform_backward(graph, op)
 
-        if len(self.need_initialized) > 0:
-            assert self.scope is not None, \
+        if len(self._need_initialized) > 0:
+            assert self._scope is not None, \
             'The scope cannot be set None when activation_quantize_type equals to range_abs_max.'
-            assert self.program_exe is not None, \
+            assert self._program_exe is not None, \
             'The program_exe cannot be set None when activation_quantize_type equals to range_abs_max.'
             init_program = Program()
-            for var_desc, initializer in self.need_initialized.iteritems():
-                var = Variable.construct_from_desc(init_program.global_block(),
-                                                   var_desc)
+            for var_desc, initializer in self._need_initialized.items():
+                var = Variable(init_program.global_block())
+                var._set_desc(var_desc)
                 initializer(var, init_program.global_block())
-            self.program_exe.run(program=init_program, scope=self.scope)
+            self._program_exe.run(program=init_program, scope=self._scope)
 
         return graph
 
     def _create_global_step(self, graph):
-        if self.weight_quantize_type == 'range_abs_max' or \
-                self.activation_quantize_type == 'range_abs_max':
+        if self._weight_quantize_type == 'range_abs_max' or \
+                self._activation_quantize_type == 'range_abs_max':
             counter_name = '@STEP_COUNTER@'
             for node in graph.all_vars():
                 if node.name() == counter_name:
-                    self.global_step = node
-            if self.global_step is None:
+                    self._global_step = node
+            if self._global_step is None:
                 global_step_in = graph.create_param_node(
                     name=counter_name,
                     var_type=core.VarDesc.VarType.LOD_TENSOR,
                     shape=[1],
                     var_dtype=core.VarDesc.VarType.INT64)
-                self.need_initialized[global_step_in.var()] = \
+                self._need_initialized[global_step_in.var()] = \
                     Constant(value=0, force_cpu=True)
                 global_step_out = graph.create_var_node_from_desc(
                     global_step_in.var())
@@ -184,9 +180,9 @@ class QuantizationTransformPass(object):
                     attrs={'step': 1.0},
                     inputs={'X': global_step_in},
                     outputs={'Out': global_step_out})
-                self._link_to(global_step_in, increment_op)
-                self._link_to(increment_op, global_step_out)
-                self.global_step = global_step_out
+                graph.link_to(global_step_in, increment_op)
+                graph.link_to(increment_op, global_step_out)
+                self._global_step = global_step_out
 
     def _insert_quant_op(self, graph, var_node, quant_bits, quant_type):
         """
@@ -220,9 +216,9 @@ class QuantizationTransformPass(object):
             inputs={'X': var_node},
             outputs={'Out': quant_var_node,
                      'OutScale': scale_var_node})
-        self._link_to(var_node, quant_op_node)
-        self._link_to(quant_op_node, quant_var_node)
-        self._link_to(quant_op_node, scale_var_node)
+        graph.link_to(var_node, quant_op_node)
+        graph.link_to(quant_op_node, quant_var_node)
+        graph.link_to(quant_op_node, scale_var_node)
         return quant_var_node, scale_var_node
 
     def _insert_quant_range_abs_max_op(self, graph, var_node, quant_bits):
@@ -242,26 +238,26 @@ class QuantizationTransformPass(object):
             var_type=core.VarDesc.VarType.LOD_TENSOR,
             shape=[1],
             var_dtype=var_node.var().dtype())
-        self.need_initialized[scale_in_node.var()] = Constant(value=0.001)
+        self._need_initialized[scale_in_node.var()] = Constant(value=0.001)
 
         scale_out_node = graph.create_var_node_from_desc(scale_in_node.var())
         inputs = {'X': var_node, 'InScale': scale_in_node}
         outputs = {'Out': quant_var_node, 'OutScale': scale_out_node}
 
-        if not self.is_test:
+        if not self._is_test:
             # The name of scales_var_node maybe 'scales_0', 'scales_1', etc.
             scales_node = graph.create_param_node(
                 name=unique_name.generate('scales'),
                 var_type=core.VarDesc.VarType.LOD_TENSOR,
-                shape=[self.window_size],
+                shape=[self._window_size],
                 var_dtype=var_node.var().dtype())
-            self.need_initialized[scales_node.var()] = Constant(value=0)
-            inputs['Iter'] = self.global_step
+            self._need_initialized[scales_node.var()] = Constant(value=0)
+            inputs['Iter'] = self._global_step
             outputs['OutScales'] = scales_node
         attrs = {
-            'window_size': self.window_size,
+            'window_size': self._window_size,
             'bit_length': quant_bits,
-            'is_test': self.is_test
+            'is_test': self._is_test
         }
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_range_abs_max',
@@ -269,14 +265,14 @@ class QuantizationTransformPass(object):
             inputs=inputs,
             outputs=outputs)
 
-        self._link_to(var_node, quant_op_node)
-        self._link_to(scale_in_node, quant_op_node)
-        self._link_to(quant_op_node, quant_var_node)
-        self._link_to(quant_op_node, scale_out_node)
+        graph.link_to(var_node, quant_op_node)
+        graph.link_to(scale_in_node, quant_op_node)
+        graph.link_to(quant_op_node, quant_var_node)
+        graph.link_to(quant_op_node, scale_out_node)
 
-        if not self.is_test:
-            self._link_to(self.global_step, quant_op_node)
-            self._link_to(quant_op_node, scales_node)
+        if not self._is_test:
+            graph.link_to(self._global_step, quant_op_node)
+            graph.link_to(quant_op_node, scales_node)
 
         return quant_var_node, scale_out_node
 
@@ -298,21 +294,11 @@ class QuantizationTransformPass(object):
             inputs={'X': var_node,
                     'Scale': scale_var_node},
             outputs={'Out': dequant_var_node})
-        self._link_to(var_node, dequant_op_node)
-        self._link_to(scale_var_node, dequant_op_node)
-        self._link_to(dequant_op_node, dequant_var_node)
+        graph.link_to(var_node, dequant_op_node)
+        graph.link_to(scale_var_node, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
         return dequant_var_node
 
-    def _update_input(self, old_input_node, new_input_node, op_node):
-        old_input_node.outputs_remove(op_node)
-        op_node.inputs_remove(old_input_node)
-        new_input_node.outputs_append(op_node)
-        op_node.inputs_append(new_input_node)
-
-    def _link_to(self, node_in, node_out):
-        node_in.outputs_append(node_out)
-        node_out.inputs_append(node_in)
-
     def _quantized_var_name(self, var_name):
         """
         Return quantized variable name for the input `var_name`.
@@ -330,25 +316,3 @@ class QuantizationTransformPass(object):
         Return quantized variable name for the input `var_name`.
         """
         return "%s.scale" % (var_name)
-
-    def _original_var_name(self, var_name):
-        """
-        Return the original variable name.
-        """
-        if var_name.endswith('.quantized.dequantized'):
-            return var_name[:-len('.quantized.dequantized')]
-        if var_name.endswith('.quantized'):
-            return var_name[:-len('.quantized')]
-        if var_name.endswith('.dequantized'):
-            return var_name[:-len('.dequantized')]
-        if var_name.endswith('.scale'):
-            return var_name[:-len('.scale')]
-        else:
-            return var_name
-
-    def _is_float(self, v):
-        return isinstance(v, float) or isinstance(v, np.float32)
-
-    def _quant(self, x, scale, num_bits):
-        y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
-        return y
diff --git a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
index 31188bedbbe..1bd4b95d6b9 100644
--- a/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/unitest/test_quantization_pass.py
@@ -18,8 +18,8 @@ import numpy as np
 import paddle.fluid as fluid
 import six
 from paddle.fluid.framework import Program
+from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid.contrib.slim.graph import PyGraph
 from paddle.fluid import core
 
 
@@ -106,7 +106,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
         exe = fluid.Executor(fluid.CPUPlace())
-        graph = PyGraph(core.Graph(main.desc), for_test=False)
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             program_exe=exe,
@@ -119,7 +119,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
         graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
-        val_graph = PyGraph(core.Graph(program.desc), for_test=False)
+        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         val_marked_nodes = set()
         for op in val_graph.all_ops():
             if op.name().find('quantize') > -1:
@@ -142,7 +142,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
             opt = fluid.optimizer.Adam(learning_rate=0.001)
             opt.minimize(loss)
         exe = fluid.Executor(fluid.CPUPlace())
-        graph = PyGraph(core.Graph(main.desc), for_test=False)
+        graph = IrGraph(core.Graph(main.desc), for_test=False)
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             program_exe=exe,
@@ -155,7 +155,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
         graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
-        val_graph = PyGraph(core.Graph(program.desc), for_test=False)
+        val_graph = IrGraph(core.Graph(program.desc), for_test=False)
         val_marked_nodes = set()
         for op in val_graph.all_ops():
             if op.name().find('quantize') > -1:
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 1913f58e679..fc5e471ae30 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -23,6 +23,7 @@ import traceback
 import six
 
 import numpy as np
+import subprocess
 
 from .. import compat as cpt
 from .proto import framework_pb2
@@ -381,27 +382,6 @@ class Variable(object):
             self._ivar.desc = self.desc
             self._ivar.stop_gradient = stop_gradient
 
-    @staticmethod
-    def construct_from_desc(block, desc):
-        """
-        Construct a Variable from variable desc.
-        Args:
-            desc(core.VarDesc): The  variable desc for constructing.
-
-        Returns:
-            Variable: A variable.
-        """
-        v = Variable(
-            block=block,
-            type=desc.type(),
-            name=desc.name(),
-            shape=desc.shape(),
-            dtype=desc.dtype(),
-            lod_level=desc.lod_level(),
-            persistable=desc.persistable())
-        v.desc = desc
-        return v
-
     def _numpy(self):
         tensor = self._ivar.value().get_tensor()
         return np.array(tensor)
@@ -1533,6 +1513,154 @@ class Block(object):
         return ret_var
 
 
+class IrGraph(object):
+    """
+    IrGraph uses core.Graph as the delegation to accomplish the manipulation.
+    """
+
+    def __init__(self, graph, for_test=False):
+        """
+        Construct the IrGraph using core.Graph.
+        Args:
+            graph(core.Graph): C++ Graph.
+            for_test(bool): True for the test graph and false for the train graph.
+        """
+        assert isinstance(
+            graph, core.Graph), 'graph must be the instance of core.Graph.'
+        self.graph = graph
+        self._for_test = for_test
+
+    def is_test(self):
+        return self._for_test
+
+    def all_parameters(self):
+        param_nodes = set()
+        for node in self.graph.nodes():
+            if node.is_var() and node.var() is not None and node.var(
+            ).persistable():
+                param_nodes.add(node)
+        return param_nodes
+
+    def all_vars(self):
+        return {node for node in self.graph.nodes() if node.is_var()}
+
+    def all_ops(self):
+        return {node for node in self.graph.nodes() if node.is_op()}
+
+    def create_param_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        var_desc.set_persistable(True)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node(self, name, var_type, shape, var_dtype):
+        var_desc = core.VarDesc(name)
+        var_desc.set_type(var_type)
+        var_desc.set_shape(shape)
+        var_desc.set_dtype(var_dtype)
+        return self.graph.create_var_node(var_desc)
+
+    def create_var_node_from_desc(self, var_desc):
+        return self.graph.create_var_node(var_desc)
+
+    def create_op_node(self, op_type, attrs, inputs, outputs):
+        op_desc = core.OpDesc()  # fill in a fresh OpDesc, then wrap it in a node
+        op_desc.set_type(op_type)
+        for attr, value in six.iteritems(attrs):  # six: works on py2 and py3
+            self._update_desc_attr(op_desc, attr, value)
+        for input_name, var_nodes in six.iteritems(inputs):
+            if not isinstance(var_nodes, list):  # accept a single node or a list
+                var_nodes = [var_nodes]
+            op_desc.set_input(input_name,
+                              [var_node.name() for var_node in var_nodes])
+        for output_name, var_nodes in six.iteritems(outputs):
+            if not isinstance(var_nodes, list):  # accept a single node or a list
+                var_nodes = [var_nodes]
+            op_desc.set_output(output_name,
+                               [var_node.name() for var_node in var_nodes])
+        return self.graph.create_op_node(op_desc)
+
+    def create_op_node_from_desc(self, op_desc):
+        return self.graph.create_op_node(op_desc)
+
+    def update_input_link(self, old_input_node, new_input_node, op_node):
+        assert old_input_node in self.graph.nodes() and new_input_node in self.graph.nodes() and \
+            op_node in self.graph.nodes(), 'The three arguments must be in the graph nodes.'
+        old_input_node.outputs_remove(op_node)  # detach the old var -> op edge
+        op_node.inputs_remove(old_input_node)
+        new_input_node.outputs_append(op_node)  # attach the new var -> op edge
+        op_node.inputs_append(new_input_node)
+        op_node.op()._rename_input(old_input_node.name(), new_input_node.name())
+
+    def link_to(self, node_in, node_out):
+        assert node_in in self.graph.nodes() and node_out in self.graph.nodes(), \
+            'The two arguments must be in the graph nodes.'
+        node_in.outputs_append(node_out)  # record the edge on both endpoints
+        node_out.inputs_append(node_in)
+
+    def safe_remove_nodes(self, remove_nodes):
+        if not isinstance(remove_nodes, set):
+            remove_nodes = set(remove_nodes)
+        core.graph_safe_remove_nodes(self.graph, remove_nodes)
+
+    def draw(self, save_path, name, marked_nodes=None):
+        def _convert_to_pdf(dot_file_path):
+            pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
+            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
+                            + ' -o ' + pdf_save_path, shell=True)
+            if exited_code != 0:
+                print('The dot command is needed for creating pdf files.')
+                print('The {} is saved as the dot filetype.'.format(
+                    dot_file_path))
+
+        remove_ctr_vars = set()
+        ops_num = 0
+        for node in self.graph.nodes():
+            if node.is_ctrl_var():
+                remove_ctr_vars.add(node)
+            elif node.is_op():
+                ops_num += 1
+        print('Total ops num = {}.'.format(ops_num))
+        self.safe_remove_nodes(remove_ctr_vars)
+        if marked_nodes is not None:
+            if not isinstance(marked_nodes, set):
+                marked_nodes = set(marked_nodes)
+            marked_nodes = marked_nodes - remove_ctr_vars
+            if self.graph.has('__graphviz__marked_node__'):
+                self.graph.erase('__graphviz__marked_node__')
+            self.graph.set('__graphviz__marked_node__', marked_nodes)
+        viz_dot_path = os.path.join(save_path, name) + '.dot'
+        viz_pass = core.get_pass('graph_viz_pass')
+        viz_pass.set('graph_viz_path', viz_dot_path)
+        viz_pass.apply(self.graph)
+        _convert_to_pdf(viz_dot_path)
+
+    def to_program(self):
+        convert_pass = core.get_pass('graph_to_program_pass')
+        convert_pass.set('program', Program().desc)
+        convert_pass.apply(self.graph)
+        desc = convert_pass.get_program('program')
+        program = Program._construct_from_desc(desc)
+        return program
+
+    def _update_desc_attr(self, desc, name, val):
+        """
+        Update the value of desc's attribute by attribute's name.
+        """
+        if isinstance(val, Block):
+            desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and all(
+                isinstance(v, Block) for v in val):
+            desc.set_blocks_attr(name, [v.desc for v in val])
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            desc._set_attr(name, val)
+
+
 class Program(object):
     """
     Python Program. Beneath it is a ProgramDesc, which is used for
@@ -1958,12 +2086,10 @@ class Program(object):
         return p
 
     @staticmethod
-    def construct_from_desc(desc):
+    def _construct_from_desc(desc):
         """
         Construct a program from program desc.
 
-        Notes: All information about parameters will be lost.
-
         Args:
             desc(core.ProgramDesc): The program desc for constructing.
 
-- 
GitLab


From c6f99a16451c47cfa12633d3b871c3e8940cbd48 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Tue, 22 Jan 2019 14:56:12 +0800
Subject: [PATCH 61/73] Update API.spec. test=develop

---
 paddle/fluid/API.spec | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index b4fc560a5a6..ad39542b4d8 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1,7 +1,6 @@
 paddle.fluid.Program.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.block ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.clone ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,))
-paddle.fluid.Program.construct_from_desc ArgSpec(args=['desc'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-- 
GitLab


From 2f4aee361a7bacbac375ea082b1a1a646c6b3b40 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Tue, 22 Jan 2019 07:20:52 +0000
Subject: [PATCH 62/73] fix comments test=develop

---
 .../fluid/inference/tests/api/tester_helper.h | 19 +++++++++++-
 .../inference/tests/api/trt_models_tester.cc  | 31 +++++++++++++++++++
 .../tensorrt/tensorrt_engine_op_test.cc       |  4 +--
 3 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index ac964dc0c86..8ee89c34f0b 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace inference {
 
+float Random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(low, high);
+  return dist(mt);
+}
+
 void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   const auto *analysis_config =
       reinterpret_cast<const contrib::AnalysisConfig *>(config);
@@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
     float *input_data = static_cast<float *>(input.data.data());
     // fill input data, for profile easily, do not use random data here.
     for (size_t j = 0; j < len; ++j) {
-      *(input_data + j) = static_cast<float>(j) / len;
+      *(input_data + j) = Random(0, 10.);
     }
   }
   (*inputs).emplace_back(input_slots);
@@ -344,6 +351,16 @@ void CompareNativeAndAnalysis(
   CompareResult(analysis_outputs, native_outputs);
 }
 
+void CompareNativeAndAnalysis(
+    PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
+    const std::vector<std::vector<PaddleTensor>> &inputs) {
+  int batch_size = FLAGS_batch_size;
+  std::vector<PaddleTensor> native_outputs, analysis_outputs;
+  native_pred->Run(inputs[0], &native_outputs, batch_size);
+  analysis_pred->Run(inputs[0], &analysis_outputs, batch_size);
+  CompareResult(analysis_outputs, native_outputs);
+}
+
 template <typename T>
 std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
   std::stringstream ss;
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 9725c190329..8d177542934 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) {
       inputs_all);
 }
 
+void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
+  contrib::AnalysisConfig analysis_config;
+  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
+                                     use_tensorrt, FLAGS_batch_size);
+  auto config =
+      reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config);
+  auto native_pred = CreateTestPredictor(config, false);
+  auto analysis_pred = CreateTestPredictor(config, true);
+  for (int i = 0; i < 100; i++) {
+    std::vector<std::vector<PaddleTensor>> inputs_all;
+    if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+      SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                        FLAGS_param_filename);
+    } else {
+      SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+    }
+    CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(),
+                             inputs_all);
+  }
+}
+
 TEST(TensorRT_mobilenet, compare) {
   std::string model_dir = FLAGS_infer_model + "/mobilenet";
   compare(model_dir, /* use_tensorrt */ true);
@@ -157,5 +178,15 @@ TEST(AnalysisPredictor, use_gpu) {
   }
 }
 
+TEST(resnet50, compare_continuous_input) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare_continuous_input(model_dir, true);
+}
+
+TEST(resnet50, compare_continuous_input_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare_continuous_input(model_dir, false);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index bb25a37584e..391e7a1c070 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 20);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 20);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::vector<std::string>>(
       engine_op_desc.Proto(), "parameters",
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));
-- 
GitLab


From 3b668c157424dce5cbf52cbe813d15275616b1f3 Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Tue, 22 Jan 2019 15:37:04 +0800
Subject: [PATCH 63/73] Update some comments in the quantization transform
 pass. test=develop

---
 .../paddle/fluid/contrib/slim/quantization/quantization_pass.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index ce16a32415e..266a106bc50 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -313,6 +313,6 @@ class QuantizationTransformPass(object):
 
     def _quantized_scale_name(self, var_name):
         """
-        Return quantized variable name for the input `var_name`.
+        Return the scale name of quantized variable for the input `var_name`.
         """
         return "%s.scale" % (var_name)
-- 
GitLab


From f534c66d2d7d87aa580538513f4835439acd7bc0 Mon Sep 17 00:00:00 2001
From: flame <fuchang1991@gmail.com>
Date: Tue, 22 Jan 2019 17:17:51 +0800
Subject: [PATCH 64/73] fix test_word2vec bug (#15462)

Fix the float equality comparison bug in test_word2vec.
---
 python/paddle/fluid/tests/book/test_word2vec.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 48cb7789276..487a29c8391 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -220,9 +220,7 @@ def infer(use_cuda, save_dirname=None):
         np_data = np.array(results[0])
         infer_out = infer_outputs[0].data.float_data()
         for a, b in zip(np_data[0], infer_out):
-            g_a = float("{:.6g}".format(a))
-            g_b = float("{:.6g}".format(b))
-            assert g_a == g_b
+            assert np.isclose(a, b), "a: {}, b: {}".format(a, b)
 
 
 def main(use_cuda, is_sparse, is_parallel):
-- 
GitLab


From bac08c4a263eeda61cc2f1bcf20d005f51f542ef Mon Sep 17 00:00:00 2001
From: WangZhen <wangzhen31@baidu.com>
Date: Tue, 22 Jan 2019 18:26:00 +0800
Subject: [PATCH 65/73] Fix some bugs caused by set functions of the Pass
 class. test=develop

---
 python/paddle/fluid/tests/unittests/test_dist_base.py    | 2 +-
 python/paddle/fluid/tests/unittests/test_pass_builder.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 3fcdc57906c..69a38618cde 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -123,7 +123,7 @@ class TestDistRunnerBase(object):
             pass_builder = build_stra._finalize_strategy_and_create_passes()
             mypass = pass_builder.insert_pass(
                 len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
-            mypass.set_int("num_repeats", args.batch_merge_repeat)
+            mypass.set("num_repeats", args.batch_merge_repeat)
 
         if args.update_method == "nccl2":
             build_stra.num_trainers = len(args.endpoints.split(","))
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
index 8c9e489e028..7e1c2572f08 100644
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -111,7 +111,7 @@ class TestPassBuilder(unittest.TestCase):
 
         pass_builder.remove_pass(len(pass_builder.all_passes()) - 1)
         self.assertEqual(origin_len + 1, len(pass_builder.all_passes()))
-        viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass")
+        viz_pass.set("graph_viz_path", "/tmp/test_viz_pass")
 
         self.check_network_convergence(
             use_cuda=core.is_compiled_with_cuda(),
-- 
GitLab


From 5a8bd82c0cef63bd7313171e8049953aa2db43f6 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Tue, 22 Jan 2019 06:40:47 -0600
Subject: [PATCH 66/73] Remove workspace_handle (#15376)

* remove workspace_handle
test=develop

* set constant for loss
test=develop
---
 paddle/fluid/operators/conv_fusion_op.cu.cc   | 65 +++++++++++--------
 .../operators/conv_transpose_cudnn_op.cu.cc   | 57 ++++++++--------
 .../fused/fusion_conv_inception_op.cu         | 23 +++----
 paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 24 +++----
 4 files changed, 93 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index d8b997cca61..f97ebecfdd9 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -104,7 +104,9 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     // ------------------- cudnn conv algorithm ---------------------
     cudnnConvolutionFwdAlgo_t algo;
     auto handle = dev_ctx.cudnn_handle();
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+
+    Tensor cudnn_workspace;
+    void* cudnn_workspace_ptr = nullptr;
 
     CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
         cudnn_conv_desc, CUDNN_DEFAULT_MATH));
@@ -118,19 +120,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
           workspace_size_limit, &algo));
       VLOG(3) << "cuDNN forward algo " << algo;
     } else {
+      cudnn_workspace =
+          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
+              framework::make_ddim(
+                  {static_cast<int64_t>(workspace_size_limit)}),
+              dev_ctx);
+      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
+
       auto search_func = [&]() {
         int returned_algo_count;
         std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
             fwd_perf_stat;
-        auto cudnn_find_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(
-              platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
-                  handle, cudnn_input_desc, input_data, cudnn_filter_desc,
-                  filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
-                  kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
-                  fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit));
-        };
-        workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+
+        CUDNN_ENFORCE(platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+            handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, cudnn_output_desc, output_data,
+            kNUM_CUDNN_FWD_ALGS, &returned_algo_count, fwd_perf_stat.data(),
+            cudnn_workspace_ptr, workspace_size_limit));
+
         VLOG(3) << "Perf result: (algo: stat, time, memory)";
         for (int i = 0; i < returned_algo_count; ++i) {
           const auto& stat = fwd_perf_stat[i];
@@ -181,6 +188,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
 
+    if (!cudnn_workspace_ptr) {
+      cudnn_workspace =
+          ctx.AllocateTmpTensor<int8_t, platform::CUDADeviceContext>(
+              framework::make_ddim(
+                  {static_cast<int64_t>(workspace_size_in_bytes)}),
+              dev_ctx);
+      cudnn_workspace_ptr = static_cast<void*>(cudnn_workspace.data<int8_t>());
+    }
+
     if ((activation == "identity") && (!residual)) {
       // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
       // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
@@ -188,13 +204,12 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // cudnnConvolutionForward and cudnnAddTensor
       // ------------- cudnn conv forward and bias add ---------------------
       ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
+          workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+
       CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
           handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
           output_data));
@@ -205,15 +220,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
       // ------------------- cudnn conv+bias+act forward --------------------
       ScalingParamType<T> alpha1 = 1.0f;
       ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
-            output_data));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+          filter_data, cudnn_conv_desc, algo, cudnn_workspace_ptr,
+          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+          output_data));
     }
     std::vector<int> channels = ctx.Attr<std::vector<int>>("split_channels");
     if (channels.size()) {
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index f44094ca6b7..016cf8448c5 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -104,16 +104,18 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     int output_offset = output->numel() / output->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size_in_bytes);
+    void* cudnn_workspace = temp_allocation->ptr();
+
     for (int g = 0; g < groups; g++) {
-      auto cudnn_func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-            handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
-            cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
-            algo, cudnn_workspace, workspace_size_in_bytes, &beta,
-            cudnn_output_desc, output_data + output_offset * g));
-      };
-      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
     }
   }
 };
@@ -209,20 +211,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
         output_grad->numel() / output_grad->dims()[0] / groups;
     int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size_in_bytes);
+    void* cudnn_workspace = temp_allocation->ptr();
+
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
       for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_filter_desc,
-              filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-              input_grad_data + input_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
       }
     }
 
@@ -232,15 +236,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
       for (int g = 0; g < groups; g++) {
-        auto cudnn_func = [&](void* cudnn_workspace) {
-          CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-              handle, &alpha, cudnn_output_desc,
-              output_grad_data + output_grad_offset * g, cudnn_input_desc,
-              input_data + input_offset * g, cudnn_conv_desc, filter_algo,
-              cudnn_workspace, workspace_size_in_bytes, &beta,
-              cudnn_filter_desc, filter_grad_data + filter_offset * g));
-        };
-        workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
       }
     }
   }
diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
index 6e138878664..c72a966c575 100644
--- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu
@@ -216,18 +216,19 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
     out_datas.push_back(
         static_cast<void*>(output_data + (oc0 + oc1 + oc2) * h * w));
 
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size_in_bytes);
+    void* cudnn_workspace = temp_allocation->ptr();
+
     for (int i = 0; i < 4; ++i) {
-      auto func = [&](void* cudnn_workspace) {
-        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-            handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
-            static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
-            algo[i], cudnn_workspace, workspace_size_in_bytes, &beta,
-            out_desc[i], out_datas[i], bias_desc[i],
-            static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
-            out_desc[i], out_datas[i]));
-      };
-      auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-      workspace_handle.RunFunc(func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+          handle, &alpha, in_desc[i], in_datas[i], filter_desc[i],
+          static_cast<const void*>(filters[i]->data<T>()), conv_desc[i],
+          algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, out_desc[i],
+          out_datas[i], bias_desc[i],
+          static_cast<const void*>(bias[i]->data<T>()), cudnn_act_desc,
+          out_desc[i], out_datas[i]));
     }
 
     cudnnTensorDescriptor_t x_desc;
diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
index a764d59410c..5e16a209e71 100644
--- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
@@ -144,17 +144,19 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
         CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
 
     T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
-
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
-          handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
-          warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
-          loss_data, cu_grad_desc, warpctc_grad_data,
-          CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
-          workspace_size));
-    };
-    workspace_handle.RunFunc(cudnn_func, workspace_size);
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), loss, static_cast<T>(0));
+
+    auto temp_allocation =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
+            workspace_size);
+    void* cudnn_workspace = temp_allocation->ptr();
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
+        handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
+        warpctc_label_lengths.data(), warpctc_logits_lengths.data(), loss_data,
+        cu_grad_desc, warpctc_grad_data, CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
+        cu_ctcloss_desc, cudnn_workspace, workspace_size));
   }
 };
 
-- 
GitLab


From f4dec5cdeeba98c3955e02decd74fb9c02fc3202 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 23 Jan 2019 10:45:28 +0800
Subject: [PATCH 67/73] Check collective server's data. (#15449)

---
 .../distributed/collective_server_test.cc       | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
index 5009058422b..90f2f9fd65b 100644
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 #include "paddle/fluid/operators/distributed/collective_client.h"
 #include "paddle/fluid/operators/distributed/collective_server.h"
@@ -57,7 +58,7 @@ std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
   auto* tensor = slr->mutable_value();
   auto* rows = slr->mutable_rows();
 
-  tensor->Resize(framework::make_ddim({20000, 1024}));
+  tensor->Resize(framework::make_ddim({3, 1024}));
   tensor->mutable_data<float>(place);
 
   paddle::operators::math::set_constant(ctx, tensor, 32.7);
@@ -80,6 +81,20 @@ void Gather(const std::vector<distributed::RemoteVar>& vars,
   std::vector<const framework::SelectedRows*> dst;
   client->Gather(vars, &dst, *dev_ctx, scope);
   std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
+  dev_ctx->Wait();
+
+  ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024}));
+  ASSERT_EQ(dst[0]->height(), 20000);
+  ASSERT_EQ(dst[0]->rows().size(), static_cast<size_t>(3));
+  for (int i = 0; i < 3; i++) {
+    ASSERT_EQ(dst[0]->rows()[i], i);
+  }
+
+  std::vector<float> vec;
+  TensorToVector(dst[0]->value(), *dev_ctx, &vec);
+  for (size_t i = 0; i < 3 * 1024; i++) {
+    ASSERT_FLOAT_EQ(vec[i], 32.7);
+  }
 }
 
 TEST(CollectiveServer, GPU) {
-- 
GitLab


From eaad3e4c3dc9926054ec2989cc780df734a433bb Mon Sep 17 00:00:00 2001
From: Yiqun Liu <liuyiqun01@baidu.com>
Date: Wed, 23 Jan 2019 10:57:36 +0800
Subject: [PATCH 68/73] Add check of input in sequence_expand op. (#15466)

* Add check of input in sequence_expand op.
test=develop

* Correct the unittest of sequence_expand op.
test=develop
---
 paddle/fluid/operators/sequence_ops/sequence_expand_op.cc   | 5 +++++
 python/paddle/fluid/tests/unittests/test_sequence_expand.py | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
index c07e6962e67..27e0201bd70 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc
@@ -68,6 +68,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
                        "Level number of Input(X)'s lod could be 0. Otherwise "
                        "size of Input(X)'s first level lod should be equal to "
                        "size of Input(Y)'s referred level lod.");
+      } else {
+        PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1,
+                          "When Input(X)'s lod is null, the dims[0] of "
+                          "Input(X) should match the "
+                          "size of Input(Y)'s referred level lod.");
       }
 
       int64_t out_first_dim = 0;
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index ffd4026dbad..d33a57f675a 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -81,11 +81,10 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
         y_lod = [[2, 3], [2, 2, 3, 3, 3]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
-        self.attrs = {'ref_level': 0}
+        self.attrs = {'ref_level': 1}
 
 
 class TestSequenceExpandCase2(TestSequenceExpand):
-- 
GitLab


From 9f5108a6733c777eaf56806a113eeb776493a989 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 23 Jan 2019 11:41:05 +0800
Subject: [PATCH 69/73] Add cicheck_brpc (#15468)

---
 paddle/scripts/paddle_build.sh | 57 +++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index cda04451f5e..bb7258ee591 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -164,6 +164,9 @@ function cmake_gen() {
         INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
     fi
 
+    distibuted_flag=${WITH_DISTRIBUTE:-OFF}
+    grpc_flag=${WITH_GRPC:-${distibuted_flag}}
+
     cat <<EOF
     ========================================
     Configuring cmake in /paddle/build ...
@@ -173,7 +176,7 @@ function cmake_gen() {
         -DWITH_DOC=${WITH_DOC:-OFF}
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
+        -DWITH_DISTRIBUTE=${distibuted_flag}
         -DWITH_MKL=${WITH_MKL:-ON}
         -DWITH_NGRAPH=${WITH_NGRAPH:-OFF}
         -DWITH_AVX=${WITH_AVX:-OFF}
@@ -194,7 +197,8 @@ function cmake_gen() {
         -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}
         -DPY_VERSION=${PY_VERSION:-2.7}
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
-        -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
+        -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} 
+        -DWITH_GRPC=${grpc_flag}
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
@@ -207,7 +211,7 @@ EOF
         -DWITH_DOC=${WITH_DOC:-OFF} \
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
+        -DWITH_DISTRIBUTE=${distibuted_flag} \
         -DWITH_MKL=${WITH_MKL:-ON} \
         -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \
         -DWITH_AVX=${WITH_AVX:-OFF} \
@@ -227,7 +231,8 @@ EOF
         -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
-        -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF}
+        -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} \
+        -DWITH_GRPC=${grpc_flag}
 
 }
 
@@ -311,6 +316,45 @@ EOF
     fi
 }
 
+function run_brpc_test() {  # Runs the ctest subset used for brpc builds.
+    mkdir -p "${PADDLE_ROOT}/build"
+    cd "${PADDLE_ROOT}/build"
+    if [[ ${WITH_TESTING:-ON} == "ON" \
+        && ${WITH_DISTRIBUTE:-OFF} == "ON" \
+        && ${WITH_GRPC:-OFF} == "OFF" ]] ; then
+    cat <<EOF
+    ========================================
+    Running brpc unit tests ...
+    ========================================
+EOF
+        set +x
+        declare -a other_tests=("test_listen_and_serv_op" "system_allocator_test")
+        all_tests=$(ctest -N)
+        # Run each extra test only if ctest actually lists it for this build.
+        for t in "${other_tests[@]}"
+        do
+            if [[ ${all_tests} != *"$t"* ]]; then
+                continue
+            fi
+
+            if [[ ${TESTING_DEBUG_MODE:-OFF} == "ON" ]] ; then
+                ctest -V -R "$t"
+            else
+                ctest --output-on-failure -R "$t"
+            fi
+        done
+        set -x
+        # Quote the pattern: it is a ctest regex and must not be glob-expanded.
+        if [[ ${TESTING_DEBUG_MODE:-OFF} == "ON" ]] ; then
+            ctest -V -R "test_dist_*"
+        else
+            ctest --output-on-failure -R "test_dist_*"
+        fi
+    fi
+}
+
+
+
 function run_mac_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
@@ -788,6 +832,11 @@ function main() {
         test_fluid_lib
         assert_api_spec_approvals
         ;;
+      cicheck_brpc)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        run_brpc_test
+        ;;
       assert_api)
         assert_api_not_changed ${PYTHON_ABI:-""}
         assert_api_spec_approvals
-- 
GitLab


From 07dc5a1506b4c349b7771f7bec342c11ae0477b1 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Wed, 23 Jan 2019 11:48:13 +0800
Subject: [PATCH 70/73] Add generate_mask_labels_op to support Mask-RCNN and
 refine some code. (#15371)

* Add generate_mask_labels_op to support Mask-RCNN.
* Refine sigmoid_cross_entropy to support normalize mode.
* Fix generate_proposal_labels.
* Use DeviceTemporaryAllocator in roi_pool and roi_align.
* Remove shape check in data_feeder.
---
 paddle/fluid/API.spec                         |   3 +-
 paddle/fluid/operators/affine_channel_op.cu   |  15 +-
 .../fluid/operators/detection/CMakeLists.txt  |   4 +
 paddle/fluid/operators/detection/bbox_util.h  |   8 +-
 .../detection/generate_mask_labels_op.cc      | 437 ++++++++++++++++++
 .../detection/generate_proposal_labels_op.cc  |  59 +--
 paddle/fluid/operators/detection/mask_util.cc | 229 +++++++++
 paddle/fluid/operators/detection/mask_util.h  |  30 ++
 .../operators/detection/mask_util_test.cc     | 115 +++++
 paddle/fluid/operators/gather_op.cc           |   2 +
 paddle/fluid/operators/roi_align_op.cu        |  47 +-
 paddle/fluid/operators/roi_pool_op.cu         |  51 +-
 .../sigmoid_cross_entropy_with_logits_op.cc   |  17 +-
 .../sigmoid_cross_entropy_with_logits_op.cu   | 180 +++++++-
 .../sigmoid_cross_entropy_with_logits_op.h    | 127 ++---
 python/paddle/fluid/data_feeder.py            |   4 +-
 python/paddle/fluid/layers/detection.py       | 182 +++++++-
 python/paddle/fluid/layers/nn.py              |  22 +-
 python/paddle/fluid/tests/test_detection.py   | 144 ++++--
 .../unittests/test_generate_mask_labels_op.py | 421 +++++++++++++++++
 .../test_generate_proposal_labels_op.py       |   4 +-
 .../unittests/test_generate_proposals_op.py   |   4 +-
 ...st_sigmoid_cross_entropy_with_logits_op.py |  32 ++
 23 files changed, 1933 insertions(+), 204 deletions(-)
 create mode 100644 paddle/fluid/operators/detection/generate_mask_labels_op.cc
 create mode 100644 paddle/fluid/operators/detection/mask_util.cc
 create mode 100644 paddle/fluid/operators/detection/mask_util.h
 create mode 100644 paddle/fluid/operators/detection/mask_util_test.cc
 create mode 100644 python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index ad39542b4d8..430882dee96 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -197,7 +197,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -318,6 +318,7 @@ paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'asp
 paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
 paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
+paddle.fluid.layers.generate_mask_labels ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index 2bebdb345ab..c054fdb1ba6 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -83,7 +83,7 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(
     T* dbias) {
   const int outer_size = C;
   const int inner_size = N * HxW;
-  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
   __shared__ typename BlockReduce::TempStorage ds_storage;
   __shared__ typename BlockReduce::TempStorage db_storage;
 
@@ -97,13 +97,16 @@ __global__ void AffineChannelScaleBiasGradientCUDAKernel(
       ds_sum += dy[index] * x[index];
       db_sum += dy[index];
     }
-    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
-    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    __syncthreads();
+    auto ds_out =
+        BlockReduce(ds_storage).Reduce(static_cast<double>(ds_sum), cub::Sum());
+    auto db_out =
+        BlockReduce(db_storage).Reduce(static_cast<double>(db_sum), cub::Sum());
+    __syncthreads();
     if (threadIdx.x == 0) {
-      dscale[i] = ds_sum;
-      dbias[i] = db_sum;
+      dscale[i] = ds_out;
+      dbias[i] = db_out;
     }
-    __syncthreads();
   }
 }
 
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 6c85f1577e0..d3a61dc367c 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -45,3 +45,7 @@ detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op
 foreach(src ${LOCAL_DETECTION_LIBS})
     set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
 endforeach()
+
+cc_library(mask_util SRCS mask_util.cc DEPS memory)
+cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util)
+detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util)
diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index 6abeca1da44..b99edb5bf05 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -1,13 +1,17 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #pragma once
 #include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
@@ -88,7 +92,9 @@ void BboxOverlaps(const framework::Tensor& r_boxes,
       inter_w = std::max(x_max - x_min + 1, zero);
       inter_h = std::max(y_max - y_min + 1, zero);
       inter_area = inter_w * inter_h;
-      overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
+      overlaps_et(i, j) =
+          (inter_area == 0.) ? 0 : inter_area /
+                                       (r_box_area + c_box_area - inter_area);
     }
   }
 }
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
new file mode 100644
index 00000000000..46727c29de1
--- /dev/null
+++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -0,0 +1,437 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <math.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
+#include "paddle/fluid/operators/detection/mask_util.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+const int kBoxDim = 4;
+
+template <typename T>
+void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) {
+  // Raw copy of every element of `to_add` into `out` at element `offset`.
+  T* dst = out->data<T>() + offset;
+  memcpy(dst, to_add->data<T>(), to_add->numel() * sizeof(T));
+}
+
+class GenerateMaskLabelsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks that all inputs/outputs are bound and sets the output shapes.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
+                   "Input(GtClasses) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GtSegms"),
+                   "Input(GtSegms) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"),
+                   "Input(LabelsInt32) shouldn't be null.");
+    // The three outputs must be bound as well.
+    PADDLE_ENFORCE(
+        ctx->HasOutput("MaskRois"),
+        "Output(MaskRois) of GenerateMaskLabelsOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("RoiHasMaskInt32"),
+        "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("MaskInt32"),
+        "Output(MaskInt32) of GenerateMaskLabelsOp should not be null");
+    // Rank checks; every GtSegms row must have exactly two columns.
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    auto gt_segms_dims = ctx->GetInputDim("GtSegms");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2,
+                      "The rank of Input(GtSegms) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2,
+                      "The second dim of Input(GtSegms) must be 2.");
+    int num_classes = ctx->Attrs().Get<int>("num_classes");
+    int resolution = ctx->Attrs().Get<int>("resolution");
+    // The first dimension of every output is set to -1 at this stage.
+    ctx->SetOutputDim("MaskRois", {-1, 4});
+    ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1});
+    ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution});
+  }
+  // Kernel selection below: data type of Input(Rois), always on CPUPlace.
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois"));
+    return framework::OpKernelType(data_type, platform::CPUPlace());
+  }
+};
+
+/*
+ * Expand masks from shape (#masks, R ** 2) to (#masks, #classes * R ** 2),
+ * R = resolution, to encode class specific mask targets.
+ */
+template <typename T>
+static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx,
+                                    const Tensor& masks,
+                                    const Tensor& mask_class_labels,
+                                    const int resolution, const int num_classes,
+                                    Tensor* mask_targets) {
+  const uint8_t* masks_data = masks.data<uint8_t>();
+  int64_t num_mask = masks.dims()[0];
+  const int* mask_class_labels_data = mask_class_labels.data<int>();
+  const int M = resolution * resolution;
+  const int mask_dim = M * num_classes;
+  // Fill the target with -1, then copy each mask with class > 0 into its slot.
+  int* mask_targets_data =
+      mask_targets->mutable_data<int>({num_mask, mask_dim}, ctx.GetPlace());
+  math::set_constant(ctx, mask_targets, -1);
+  for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) {
+    int cls = mask_class_labels_data[mask_id];
+    int start = M * cls;
+    if (cls > 0) {
+      for (int i = 0; i < M; ++i) {
+        mask_targets_data[mask_id * mask_dim + start + i] =
+            static_cast<int>(masks_data[mask_id * M + i]);
+      }
+    }
+  }
+}
+
+template <typename T>
+std::vector<Tensor> SampleMaskForOneImage(
+    const platform::CPUDeviceContext& ctx, const Tensor& im_info,
+    const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms,
+    const Tensor& rois, const Tensor& label_int32, const int num_classes,
+    const int resolution, const framework::LoD& segm_length) {
+  // Builds the mask targets of one image. Returns {rois_fg, roi_has_mask,
+  // masks_expand}; each fg (label > 0) roi is paired with one gt polygon set.
+  const int64_t gt_size = static_cast<int64_t>(gt_classes.dims()[0]);
+  const int64_t roi_size = static_cast<int64_t>(rois.dims()[0]);
+  const int* gt_classes_data = gt_classes.data<int>();
+  const int* is_crowd_data = is_crowd.data<int>();
+  const int* label_int32_data = label_int32.data<int>();
+  PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]);
+
+  std::vector<int> mask_gt_inds, fg_inds;
+  std::vector<std::vector<std::vector<T>>> gt_polys;
+  // segm_length holds lengths; the offset form is used to slice the polygons.
+  auto polys_num = segm_length[1];
+  auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length);
+  auto lod1 = segm_lod_offset[1];
+  auto lod2 = segm_lod_offset[2];
+  const T* polys_data = gt_segms.data<T>();
+  for (int64_t i = 0; i < gt_size; ++i) {
+    if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) {
+      mask_gt_inds.emplace_back(i);
+
+      // slice fg segmentation polys
+      int poly_num = polys_num[i];
+      std::vector<std::vector<T>> polys;
+      int s_idx = lod1[i];
+      for (int j = 0; j < poly_num; ++j) {
+        int s = lod2[s_idx + j];
+        int e = lod2[s_idx + j + 1];
+        PADDLE_ENFORCE_NE(s, e);
+        std::vector<T> plts(polys_data + s * 2, polys_data + e * 2);
+        polys.push_back(plts);
+      }
+      gt_polys.push_back(polys);
+    }
+  }
+  for (int64_t i = 0; i < roi_size; ++i) {
+    if (label_int32_data[i] > 0) {
+      fg_inds.emplace_back(i);
+    }
+  }
+  int gt_num = mask_gt_inds.size();
+  int fg_num = fg_inds.size();
+  // One box per kept gt object, filled by Poly2Boxes from its polygons.
+  Tensor boxes_from_polys;
+  boxes_from_polys.mutable_data<T>({gt_num, 4}, platform::CPUPlace());
+  Poly2Boxes(gt_polys, boxes_from_polys.data<T>());
+
+  std::vector<int> roi_has_mask =
+      std::vector<int>(fg_inds.begin(), fg_inds.end());
+  Tensor mask_class_labels;
+  Tensor masks;
+  Tensor rois_fg;
+
+  auto im_scale = im_info.data<T>()[2];
+  if (fg_num > 0) {
+    // Class labels for the foreground rois
+    mask_class_labels.mutable_data<int>({fg_num, 1}, ctx.GetPlace());
+    Gather<int>(label_int32_data, 1, fg_inds.data(), fg_inds.size(),
+                mask_class_labels.data<int>());
+
+    uint8_t* masks_data = masks.mutable_data<uint8_t>(
+        {fg_num, resolution * resolution}, ctx.GetPlace());
+
+    // Find overlap between all foreground rois and the bounding boxes
+    // enclosing each segmentation
+    T* rois_fg_data = rois_fg.mutable_data<T>({fg_num, 4}, ctx.GetPlace());
+    Gather<T>(rois.data<T>(), 4, fg_inds.data(), fg_inds.size(),
+              rois_fg.data<T>());
+
+    for (int k = 0; k < rois_fg.numel(); ++k) {
+      rois_fg_data[k] = rois_fg_data[k] / im_scale;
+    }
+
+    Tensor overlaps_bbfg_bbpolys;
+    overlaps_bbfg_bbpolys.mutable_data<T>({fg_num, gt_num}, ctx.GetPlace());
+    BboxOverlaps<T>(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys);
+
+    // Map from each fg rois to the index of the mask with highest overlap
+    // (measured by bbox overlap); on ties the lowest index wins.
+    T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data<T>();
+    std::vector<int> fg_masks_inds;
+    for (int64_t i = 0; i < fg_num; ++i) {
+      const T* v = overlaps_bbfg_bbpolys_data + i * gt_num;
+      T max_overlap = std::numeric_limits<T>::lowest();
+      int id = 0;
+      for (int64_t j = 0; j < gt_num; ++j) {
+        if (v[j] > max_overlap) {
+          max_overlap = v[j];
+          id = j;
+        }
+      }
+      fg_masks_inds.push_back(id);
+    }
+
+    // add fg targets
+    for (int64_t i = 0; i < fg_num; ++i) {
+      int fg_polys_ind = fg_masks_inds[i];
+      T* roi_fg = rois_fg_data + i * 4;
+      uint8_t* mask = masks_data + i * resolution * resolution;
+      Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask);
+    }
+  } else {
+    // The network cannot handle empty blobs, so we must provide a mask.
+    // We simply take the first bg roi, give it an all -1's mask (ignore
+    // label), and label it with class zero (bg).
+    int bg_num = 1;
+    T* rois_fg_data = rois_fg.mutable_data<T>({bg_num, 4}, ctx.GetPlace());
+    const T* rois_data = rois.data<T>();
+    std::vector<int> bg_inds;
+    for (int64_t i = 0; i < roi_size; ++i) {
+      if (label_int32_data[i] == 0) {
+        bg_inds.emplace_back(i);
+        rois_fg_data[0] = rois_data[i * 4] / im_scale;
+        rois_fg_data[1] = rois_data[i * 4 + 1] / im_scale;
+        rois_fg_data[2] = rois_data[i * 4 + 2] / im_scale;
+        rois_fg_data[3] = rois_data[i * 4 + 3] / im_scale;
+        break;
+      }
+    }
+    masks.mutable_data<uint8_t>({bg_num, resolution * resolution},
+                                ctx.GetPlace());
+    math::set_constant(ctx, &masks, -1);
+    int* mask_class_labels_data =
+        mask_class_labels.mutable_data<int>({bg_num, 1}, ctx.GetPlace());
+    mask_class_labels_data[0] = 0;
+    roi_has_mask = std::vector<int>(bg_inds.begin(), bg_inds.end());
+  }
+
+  Tensor masks_expand;
+  ExpandMaskTarget<T>(ctx, masks, mask_class_labels, resolution, num_classes,
+                      &masks_expand);
+  // Undo the earlier division by im_scale before returning the rois.
+  T* rois_fg_data = rois_fg.data<T>();
+  for (int k = 0; k < rois_fg.numel(); ++k) {
+    rois_fg_data[k] = rois_fg_data[k] * im_scale;
+  }
+
+  Tensor roi_has_mask_t;
+  int roi_has_mask_size = roi_has_mask.size();
+  int* roi_has_mask_data =
+      roi_has_mask_t.mutable_data<int>({roi_has_mask_size, 1}, ctx.GetPlace());
+  std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data);
+
+  std::vector<Tensor> res;
+  res.emplace_back(rois_fg);
+  res.emplace_back(roi_has_mask_t);
+  res.emplace_back(masks_expand);
+  return res;
+}
+
+template <typename T>
+class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* im_info = ctx.Input<LoDTensor>("ImInfo");
+    auto* gt_classes = ctx.Input<LoDTensor>("GtClasses");
+    auto* is_crowd = ctx.Input<LoDTensor>("IsCrowd");
+    auto* gt_segms = ctx.Input<LoDTensor>("GtSegms");
+    auto* rois = ctx.Input<LoDTensor>("Rois");
+    auto* label_int32 = ctx.Input<LoDTensor>("LabelsInt32");
+
+    auto* mask_rois = ctx.Output<LoDTensor>("MaskRois");
+    auto* roi_has_mask_int32 = ctx.Output<LoDTensor>("RoiHasMaskInt32");
+    auto* mask_int32 = ctx.Output<LoDTensor>("MaskInt32");
+
+    int num_classes = ctx.Attr<int>("num_classes");
+    int resolution = ctx.Attr<int>("resolution");
+    // The per-image inputs are expected to carry exactly one LoD level.
+    PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL,
+                      "GenerateMaskLabelsOp gt_classes needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
+                      "GenerateMaskLabelsOp is_crowd needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL,
+                      "GenerateMaskLabelsOp rois needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL,
+                      "GenerateMaskLabelsOp label_int32 needs 1 level of LoD");
+
+    PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL);
+    // n is the sequence count of gt_classes' LoD, iterated as images below.
+    int64_t n = static_cast<int64_t>(gt_classes->lod().back().size() - 1);
+    PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n);
+
+    int mask_dim = num_classes * resolution * resolution;
+    // Outputs get rois->numel() rows up front and are resized at the end.
+    mask_rois->mutable_data<T>({rois->numel(), kBoxDim}, ctx.GetPlace());
+    roi_has_mask_int32->mutable_data<int>({rois->numel(), 1}, ctx.GetPlace());
+    mask_int32->mutable_data<int>({rois->numel(), mask_dim}, ctx.GetPlace());
+
+    framework::LoD lod;
+    std::vector<size_t> lod0(1, 0);
+    // num_mask counts rows appended so far and is the next write position.
+    int64_t num_mask = 0;
+    auto& dev_ctx = ctx.device_context<platform::CPUDeviceContext>();
+
+    auto gt_classes_lod = gt_classes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
+    auto rois_lod = rois->lod().back();
+    auto label_int32_lod = label_int32->lod().back();
+    auto gt_segms_lod = gt_segms->lod();
+    // Process the batch one image at a time via SampleMaskForOneImage.
+    for (int i = 0; i < n; ++i) {
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      Tensor gt_classes_slice =
+          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
+      Tensor label_int32_slice =
+          label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]);
+      Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]);
+      // Sub-LoD of image i in GtSegms and the [s, e) row range it is sliced by.
+      auto sub_lod_and_offset =
+          framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0);
+      auto lod_length = sub_lod_and_offset.first;
+      size_t s = sub_lod_and_offset.second.first;
+      size_t e = sub_lod_and_offset.second.second;
+      Tensor gt_segms_slice = gt_segms->Slice(s, e);
+
+      std::vector<Tensor> tensor_output = SampleMaskForOneImage<T>(
+          dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice,
+          gt_segms_slice, rois_slice, label_int32_slice, num_classes,
+          resolution, lod_length);
+
+      Tensor sampled_mask_rois = tensor_output[0];
+      Tensor sampled_roi_has_mask_int32 = tensor_output[1];
+      Tensor sampled_mask_int32 = tensor_output[2];
+      // Append this image's results behind the rows already written.
+      AppendMask<T>(mask_rois, kBoxDim * num_mask, &sampled_mask_rois);
+      AppendMask<int>(roi_has_mask_int32, num_mask,
+                      &sampled_roi_has_mask_int32);
+      AppendMask<int>(mask_int32, mask_dim * num_mask, &sampled_mask_int32);
+
+      num_mask += sampled_mask_rois.dims()[0];
+      lod0.emplace_back(num_mask);
+    }
+    // One-level LoD with per-image offsets; shrink outputs to num_mask rows.
+    lod.emplace_back(lod0);
+    mask_rois->set_lod(lod);
+    roi_has_mask_int32->set_lod(lod);
+    mask_int32->set_lod(lod);
+    mask_rois->Resize({num_mask, kBoxDim});
+    roi_has_mask_int32->Resize({num_mask, 1});
+    mask_int32->Resize({num_mask, mask_dim});
+  }
+};
+
+class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("ImInfo",
+             "(Tensor), This input is a 2D Tensor with shape [B, 3]. "
+             "B is the number of input images, "
+             "each element consists of im_height, im_width, im_scale.");
+    AddInput("GtClasses",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+             "M is the number of groundtruth, "
+             "each element is a class label of groundtruth.");
+    AddInput(
+        "IsCrowd",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. "
+        "M is the number of groundtruth, "
+        "each element is a flag indicating whether a groundtruth is crowd.");
+    AddInput(
+        "GtSegms",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], its LoD "
+        "level is 3. LoD[0] represents the number of gt objects of each "
+        "image. LoD[1] represents the number of polygons of each object. "
+        "LoD[2] represents the number of points of each polygon. S is the "
+        "total number of polygon coordinate points. Each element is an (x, y) "
+        "coordinate point.");
+    AddInput(
+        "Rois",
+        "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. "
+        "R is the number of rois which is the output of "
+        "generate_proposal_labels, "
+        "each element is a bounding box with (xmin, ymin, xmax, ymax) format.");
+    AddInput("LabelsInt32",
+             "(LoDTensor), This input is a 2D LoDTensor with shape [R, 1], "
+             "each element represents a class label of a roi");
+    AddOutput(
+        "MaskRois",
+        "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. "
+        "P is the number of mask, "
+        "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
+    AddOutput("RoiHasMaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
+              "each element represents the output mask rois index with regard "
+              "to input rois");
+    AddOutput("MaskInt32",
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, Q], "
+              "Q equal to num_classes * resolution * resolution");
+    // Neither attribute below is given a default value.
+    AddAttr<int>("num_classes", "Class number.");
+    AddAttr<int>("resolution", "Resolution of mask.");
+
+    AddComment(R"DOC(
+Given the RoIs and their corresponding labels, this operator samples the
+foreground RoIs. For each foreground RoI, the mask branch also has
+a :math: `K \\times M^{2}` dimensional output target, which encodes K
+binary masks of resolution M x M, one for each of the K classes. These
+mask targets are used to compute the loss of the mask branch.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the op with EmptyGradOpMaker and a float-only CPU kernel.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp,
+                  ops::GenerateMaskLabelsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(generate_mask_labels,
+                       ops::GenerateMaskLabelsKernel<float>);
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index a652d4d9575..5b2e571baf3 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -48,20 +48,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                    "Input(GtBoxes) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");
 
-    PADDLE_ENFORCE(ctx->HasOutput("Rois"),
-                   "Output(Rois) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Rois"),
+        "Output(Rois) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("LabelsInt32"),
-        "Output(LabelsInt32) of RpnTargetAssignOp should not be null");
+        "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null");
     PADDLE_ENFORCE(
         ctx->HasOutput("BboxTargets"),
-        "Output(BboxTargets) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxInsideWeights"),
-        "Output(BboxInsideWeights) of RpnTargetAssignOp should not be null");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("BboxOutsideWeights"),
-        "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null");
+        "Output(BboxTargets) of GenerateProposalLabelsOp should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"),
+                   "Output(BboxInsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"),
+                   "Output(BboxOutsideWeights) of GenerateProposalLabelsOp "
+                   "should not be null");
 
     auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
     auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
@@ -225,30 +226,36 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 
 template <typename T>
 std::vector<Tensor> SampleRoisForOneImage(
-    const platform::CPUDeviceContext& context, Tensor* rpn_rois,
-    Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
-    const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
-    const float bg_thresh_hi, const float bg_thresh_lo,
+    const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in,
+    const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes,
+    const Tensor& im_info, const int batch_size_per_im, const float fg_fraction,
+    const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo,
     const std::vector<float>& bbox_reg_weights, const int class_nums,
     std::minstd_rand engine, bool use_random) {
-  auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
-  auto im_scale = im_info->data<T>()[2];
-  rpn_rois_et = rpn_rois_et / im_scale;
+  auto im_scale = im_info.data<T>()[2];
+
+  Tensor rpn_rois;
+  rpn_rois.mutable_data<T>(rpn_rois_in.dims(), context.GetPlace());
+  T* rpn_rois_dt = rpn_rois.data<T>();
+  const T* rpn_rois_in_dt = rpn_rois_in.data<T>();
+  for (int i = 0; i < rpn_rois.numel(); ++i) {
+    rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale;
+  }
 
   Tensor boxes;
-  int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
+  int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0];
   boxes.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
-  Concat<T>(context, *gt_boxes, *rpn_rois, &boxes);
+  Concat<T>(context, gt_boxes, rpn_rois, &boxes);
 
   // Overlaps
   Tensor proposal_to_gt_overlaps;
-  proposal_to_gt_overlaps.mutable_data<T>({proposals_num, gt_boxes->dims()[0]},
+  proposal_to_gt_overlaps.mutable_data<T>({proposals_num, gt_boxes.dims()[0]},
                                           context.GetPlace());
-  BboxOverlaps<T>(boxes, *gt_boxes, &proposal_to_gt_overlaps);
+  BboxOverlaps<T>(boxes, gt_boxes, &proposal_to_gt_overlaps);
 
   // Generate proposal index
   std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
-      context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
+      context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im,
       fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
   std::vector<int> fg_inds = fg_bg_gt[0];
   std::vector<int> bg_inds = fg_bg_gt[1];
@@ -263,7 +270,7 @@ std::vector<Tensor> SampleRoisForOneImage(
   sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
   sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
   sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
-  GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
+  GatherBoxesLabels<T>(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds,
                        gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);
 
   // Compute targets
@@ -397,8 +404,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
           gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
       Tensor im_info_slice = im_info->Slice(i, i + 1);
       std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
-          dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
-          &gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
+          dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice,
+          gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction,
           fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
           engine, use_random);
       Tensor sampled_rois = tensor_output[0];
@@ -467,7 +474,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
         "P usuall equal to  batch_size_per_im * batch_size, "
         "each element is a bounding box with [xmin, ymin, xmax, ymax] format.");
     AddOutput("LabelsInt32",
-              "(LoDTensor), This output is a 2D LoDTensor with shape [P], "
+              "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], "
               "each element repersents a class label of a roi");
     AddOutput("BboxTargets",
               "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * "
diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc
new file mode 100644
index 00000000000..bd6fee71381
--- /dev/null
+++ b/paddle/fluid/operators/detection/mask_util.cc
@@ -0,0 +1,229 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/mask_util.h"
+#include <math.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <limits>
+#include <utility>
+#include "paddle/fluid/memory/memory.h"
+
+namespace paddle {
+namespace operators {
+
+uint32_t UMax(uint32_t a, uint32_t b) { return b < a ? a : b; }  // max(a, b)
+
+static inline int Compare(const void* a, const void* b) {
+  // qsort comparator: orders the two pointed-to uint32_t values ascending.
+  const uint32_t lhs = *static_cast<const uint32_t*>(a);
+  const uint32_t rhs = *static_cast<const uint32_t*>(b);
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+void Decode(const uint32_t* cnts, int m, uint8_t* mask) {
+  // Expands m run lengths into bytes written consecutively at mask. Runs
+  // alternate in value starting with 0: even runs write 0, odd runs write 1.
+  uint8_t* cursor = mask;
+  for (int run = 0; run < m; ++run) {
+    const uint8_t value = static_cast<uint8_t>(run & 1);
+    cursor = std::fill_n(cursor, cnts[run], value);
+  }
+}
+
+typedef uint32_t uint;
+void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) {
+  int j, m = 0;  // Rasterizes the k-vertex polygon xy into mask (h * w bytes).
+  double scale = 5;  // vertices are scaled up by this factor before tracing
+  int *x, *y, *u, *v;
+  uint *a, *b;
+  platform::CPUPlace cpu;
+  auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2);  // x | y halves
+  x = reinterpret_cast<int*>(xptr->ptr());
+  y = x + (k + 1);
+
+  for (j = 0; j < k; j++) x[j] = static_cast<int>(scale * xy[j * 2 + 0] + .5);
+  x[k] = x[0];  // repeat the first vertex so the last edge closes the polygon
+  for (j = 0; j < k; j++) y[j] = static_cast<int>(scale * xy[j * 2 + 1] + .5);
+  y[k] = y[0];
+  for (j = 0; j < k; j++) {
+    m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1;  // per edge
+  }
+  auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2);  // u | v halves
+  u = reinterpret_cast<int*>(vptr->ptr());
+  v = u + m;
+  m = 0;
+  for (j = 0; j < k; j++) {
+    int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d;
+    int flip;  // set when the edge runs backwards along its major axis
+    double s;  // slope of the minor coordinate per major-axis step
+    dx = abs(xe - xs);
+    dy = abs(ys - ye);
+    flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye);
+    if (flip) {  // swap endpoints so the major coordinate ascends
+      t = xs;
+      xs = xe;
+      xe = t;
+      t = ys;
+      ys = ye;
+      ye = t;
+    }
+    if (dx >= dy) {  // x is the major axis: one point per x step
+      s = dx == 0 ? 0 : static_cast<double>(ye - ys) / dx;
+      for (d = 0; d <= dx; d++) {
+        t = flip ? dx - d : d;  // emit points in the edge's original order
+        u[m] = t + xs;
+        v[m] = static_cast<int>(ys + s * t + .5);
+        m++;
+      }
+    } else {  // y is the major axis: one point per y step
+      s = dy == 0 ? 0 : static_cast<double>(xe - xs) / dy;
+      for (d = 0; d <= dy; d++) {
+        t = flip ? dy - d : d;  // emit points in the edge's original order
+        v[m] = t + ys;
+        u[m] = static_cast<int>(xs + s * t + .5);
+        m++;
+      }
+    }
+  }
+  /* keep the points where the traced boundary changes x, then downsample */
+  k = m;
+  m = 0;
+  double xd, yd;
+  auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2);
+  x = reinterpret_cast<int*>(xyptr->ptr());
+  y = x + k;
+  for (j = 1; j < k; j++) {
+    if (u[j] != u[j - 1]) {  // x changed between consecutive traced points
+      xd = static_cast<double>(u[j] < u[j - 1] ? u[j] : u[j] - 1);
+      xd = (xd + .5) / scale - .5;  // back to unscaled coordinates
+      if (floor(xd) != xd || xd < 0 || xd > w - 1) continue;  // not a column
+      yd = static_cast<double>(v[j] < v[j - 1] ? v[j] : v[j - 1]);
+      yd = (yd + .5) / scale - .5;
+      if (yd < 0)
+        yd = 0;
+      else if (yd > h)
+        yd = h;
+      yd = ceil(yd);  // yd is now an integer in [0, h]
+      x[m] = static_cast<int>(xd);
+      y[m] = static_cast<int>(yd);
+      m++;
+    }
+  }
+  /* compute rle encoding given y-boundary points (offsets are x * h + y) */
+  k = m;
+  auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1));
+  a = reinterpret_cast<uint*>(aptr->ptr());
+  for (j = 0; j < k; j++) a[j] = static_cast<uint>(x[j] * h + y[j]);
+  a[k++] = static_cast<uint>(h * w);  // terminator: total number of cells
+
+  qsort(a, k, sizeof(uint), Compare);  // ascending offsets
+  uint p = 0;
+  for (j = 0; j < k; j++) {
+    uint t = a[j];
+    a[j] -= p;  // absolute offset -> gap from the previous offset
+    p = t;
+  }
+  auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k);
+  b = reinterpret_cast<uint32_t*>(bptr->ptr());
+  j = m = 0;
+  b[m++] = a[j++];
+  while (j < k) {
+    if (a[j] > 0) {
+      b[m++] = a[j++];
+    } else {  // zero-length gap: fold the following gap into the previous run
+      j++;
+      if (j < k) b[m - 1] += a[j++];
+    }
+  }
+
+  // decode the runs column-major (index col * h + row), then transpose
+  auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w);
+  uint8_t* msk = reinterpret_cast<uint8_t*>(mskptr->ptr());
+  Decode(b, m, msk);
+
+  for (int ii = 0; ii < h; ++ii) {
+    for (int jj = 0; jj < w; ++jj) {
+      mask[ii * w + jj] = msk[jj * h + ii];  // row-major output
+    }
+  }
+}
+
+void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
+                float* boxes) {
+  // For each i, stores polys[i]'s [xmin, ymin, xmax, ymax] at boxes + 4 * i.
+  for (size_t i = 0; i < polys.size(); ++i) {
+    float x0 = std::numeric_limits<float>::max();
+    float x1 = std::numeric_limits<float>::lowest();  // min() is positive
+    float y0 = std::numeric_limits<float>::max();
+    float y1 = std::numeric_limits<float>::lowest();  // min() is positive
+    // An entry may hold several polygons; the box covers all their vertices.
+    for (size_t j = 0; j < polys[i].size(); ++j) {
+      for (size_t k = 0; k < polys[i][j].size() / 2; ++k) {
+        x0 = std::min(x0, polys[i][j][2 * k]);
+        x1 = std::max(x1, polys[i][j][2 * k]);
+        y0 = std::min(y0, polys[i][j][2 * k + 1]);
+        y1 = std::max(y1, polys[i][j][2 * k + 1]);
+      }
+    }
+    boxes[i * 4] = x0;
+    boxes[i * 4 + 1] = y0;
+    boxes[i * 4 + 2] = x1;
+    boxes[i * 4 + 3] = y1;
+  }
+}
+
+void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
+                      const float* box, int M, uint8_t* mask) {
+  // Rasterizes every polygon onto an M x M grid spanning box and ORs the
+  // results into mask (M * M bytes). Box sides are clamped to at least 1.
+  float w = std::max(box[2] - box[0], static_cast<float>(1.));
+  float h = std::max(box[3] - box[1], static_cast<float>(1.));
+
+  // One polygon is drawn straight into mask; several get one M * M slice
+  // each in a vector-owned scratch buffer, so nothing leaks on any path.
+  std::vector<uint8_t> scratch;
+  uint8_t* msk = mask;
+  if (polygons.size() != 1UL) {
+    scratch.resize(static_cast<size_t>(M) * M * polygons.size());
+    msk = scratch.data();
+  }
+  for (size_t i = 0; i < polygons.size(); ++i) {
+    int k = polygons[i].size() / 2;
+    std::vector<float> p;
+    for (int j = 0; j < k; ++j) {
+      float pw = (polygons[i][2 * j] - box[0]) * M / w;
+      float ph = (polygons[i][2 * j + 1] - box[1]) * M / h;
+      p.push_back(pw);
+      p.push_back(ph);
+    }
+    uint8_t* msk_i = msk + i * M * M;
+    Poly2Mask(p.data(), k, M, M, msk_i);
+  }
+
+  if (polygons.size() > 1UL) {
+    for (size_t i = 0; i < polygons.size(); ++i) {
+      uint8_t* msk_i = msk + i * M * M;
+      for (int j = 0; j < M * M; ++j) {
+        if (i == 0) {
+          mask[j] = msk_i[j];
+        } else {
+          mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h
new file mode 100644
index 00000000000..4e0ea54f6d8
--- /dev/null
+++ b/paddle/fluid/operators/detection/mask_util.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask);
+
+void Poly2Boxes(const std::vector<std::vector<std::vector<float>>>& polys,
+                float* boxes);
+
+void Polys2MaskWrtBox(const std::vector<std::vector<float>>& polygons,
+                      const float* box, int M, uint8_t* mask);
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc
new file mode 100644
index 00000000000..de904e94746
--- /dev/null
+++ b/paddle/fluid/operators/detection/mask_util_test.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/mask_util.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/memory/memory.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+void Compare(const T* a, const T* b, const int n) {
+  // Expects a[i] == b[i] for every i in [0, n). EXPECT_EQ is non-fatal, so
+  // all mismatching positions are reported rather than only the first.
+  for (int i = 0; i < n; ++i) EXPECT_EQ(a[i], b[i]);
+}
+
+TEST(MaskUtil, Poly2MaskTest) {
+  float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f,
+                   6.53f, 5.94f, 6.38f, 1.97f, 1.88f};
+  int h = 8, w = 8;
+  int k = 5;  // length(polys) / 2
+  // clang-format off
+  uint8_t expect_mask[] = {
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 1, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 0, 0, 0, 0,
+      0, 0, 1, 1, 1, 0, 0, 0,
+      0, 0, 1, 1, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0
+  };
+  // clang-format on
+
+  // the ground-truth mask is computed by the COCO API (im_h = im_w = 8):
+  //
+  // import pycocotools.mask as mask_util
+  // import numpy as np
+  // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88]
+  // rles = mask_util.frPyObjects([segm], im_h, im_w)
+  // mask = mask_util.decode(rles)
+  // print mask
+  platform::CPUPlace cpu;
+  auto allocation = memory::Alloc(cpu, sizeof(expect_mask));
+  uint8_t* mask = reinterpret_cast<uint8_t*>(allocation->ptr());
+  Poly2Mask(polys, k, h, w, mask);
+  Compare<uint8_t>(expect_mask, mask, h * w);  // every cell must match
+}
+
+TEST(MaskUtil, Poly2BoxesTest) {
+  // clang-format off
+  std::vector<std::vector<std::vector<float>>> polys = {  // 2 entries
+      {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}},
+      {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}
+  };
+  float expect_boxes[] = {  // xmin, ymin, xmax, ymax for each entry
+      1.69f, 1.88f, 5.94f, 6.53f,
+      1.69f, 0.88f, 6.94f, 6.63f
+  };
+  // clang-format on
+
+  platform::CPUPlace cpu;
+  auto allocation = memory::Alloc(cpu, sizeof(expect_boxes));
+  float* boxes = reinterpret_cast<float*>(allocation->ptr());
+  Poly2Boxes(polys, boxes);
+  Compare<float>(expect_boxes, boxes, 8);  // 2 boxes x 4 values
+}
+
+TEST(MaskUtil, Polys2MaskWrtBoxTest) {
+  // clang-format off
+  std::vector<std::vector<std::vector<float>>> polys = {{  // 1 entry, 2 polys
+      {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f},
+      {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}};
+  float expect_boxes[] = {  // bounds over both polygons of the entry
+      1.69f, 0.88f, 6.94f, 6.63f
+  };
+  uint8_t expect_mask[] = {
+      0, 0, 0, 0, 0, 0, 0, 0,
+      0, 1, 1, 1, 1, 1, 0, 0,
+      0, 0, 1, 1, 1, 0, 0, 0,
+      0, 0, 1, 1, 1, 0, 0, 0,
+      0, 0, 1, 1, 1, 0, 0, 0,
+      0, 1, 1, 1, 1, 1, 0, 0,
+      0, 1, 1, 1, 1, 1, 1, 0,
+      1, 1, 1, 1, 1, 1, 1, 1
+  };
+  // clang-format on
+
+  platform::CPUPlace cpu;
+  auto allocation = memory::Alloc(cpu, sizeof(expect_boxes));
+  float* boxes = reinterpret_cast<float*>(allocation->ptr());
+  Poly2Boxes(polys, boxes);
+  Compare<float>(expect_boxes, boxes, 4);  // 1 box x 4 values
+
+  auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask));
+  uint8_t* mask = reinterpret_cast<uint8_t*>(allocat_mask->ptr());
+  int M = 8;  // the mask is M x M
+  Polys2MaskWrtBox(polys[0], expect_boxes, M, mask);
+  Compare<uint8_t>(expect_mask, mask, M * M);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 0a8c0814a7d..55cef93aacd 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -103,8 +103,10 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
 REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
                        ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
+                       ops::GatherOpKernel<uint8_t>,
                        ops::GatherOpKernel<int64_t>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>,
                        ops::GatherGradientOpKernel<double>,
                        ops::GatherGradientOpKernel<int>,
+                       ops::GatherGradientOpKernel<uint8_t>,
                        ops::GatherGradientOpKernel<int64_t>);
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index bcec6f3563d..8d695fdedd0 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/roi_align_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -255,8 +256,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
 
     Tensor roi_batch_id_list;
     roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto cplace = platform::CPUPlace();
+    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto rois_lod = rois->lod().back();
     int rois_batch_size = rois_lod.size() - 1;
     PADDLE_ENFORCE_EQ(
@@ -270,14 +271,18 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
         roi_batch_id_data[i] = n;
       }
     }
-    Tensor roi_batch_id_list_gpu;
-    framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
-                              &roi_batch_id_list_gpu);
-    GPUROIAlignForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = allocator.Allocate(bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+                 dev_ctx.stream());
+    GPUROIAlignForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
         output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width, sampling_ratio,
-        roi_batch_id_list_gpu.data<int>(),
+        height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data,
         out->mutable_data<T>(ctx.GetPlace()));
   }
 };
@@ -307,8 +312,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
     }
     Tensor roi_batch_id_list;
     roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto cplace = platform::CPUPlace();
+    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto rois_lod = rois->lod().back();
     int rois_batch_size = rois_lod.size() - 1;
     for (int n = 0; n < rois_batch_size; ++n) {
@@ -316,24 +321,28 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
         roi_batch_id_data[i] = n;
       }
     }
-    Tensor roi_batch_id_list_gpu;
-    framework::TensorCopySync(roi_batch_id_list, ctx.GetPlace(),
-                              &roi_batch_id_list_gpu);
-
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int));
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+                 dev_ctx.stream());
     in_grad->mutable_data<T>(ctx.GetPlace());
     math::SetConstant<Place, T> set_zero;
-    set_zero(ctx.cuda_device_context(), in_grad, static_cast<T>(0));
+    set_zero(dev_ctx, in_grad, static_cast<T>(0));
 
     int output_grad_size = out_grad->numel();
     int blocks = NumBlocks(output_grad_size);
     int threads = kNumCUDAThreads;
 
     if (output_grad_size > 0) {
-      GPUROIAlignBackward<
-          T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+      GPUROIAlignBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
           output_grad_size, rois->data<T>(), out_grad->data<T>(), rois_num,
           spatial_scale, channels, height, width, pooled_height, pooled_width,
-          sampling_ratio, roi_batch_id_list_gpu.data<int>(),
+          sampling_ratio, roi_id_data,
           in_grad->mutable_data<T>(ctx.GetPlace()));
     }
   }
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 75c3dd6bc49..ac3a4201e65 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/roi_pool_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 
@@ -152,8 +153,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
 
     framework::Tensor roi_batch_id_list;
     roi_batch_id_list.Resize({rois_num});
-    int* roi_batch_id_data =
-        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto cplace = platform::CPUPlace();
+    int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto rois_lod = rois->lod().back();
     int rois_batch_size = rois_lod.size() - 1;
     PADDLE_ENFORCE_EQ(
@@ -168,15 +169,20 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
       }
     }
 
-    framework::Tensor roi_batch_id_list_gpu;
-    framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(),
-                          ctx.device_context(), &roi_batch_id_list_gpu);
-
-    GPUROIPoolForward<
-        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+    auto& dev_ctx = ctx.cuda_device_context();
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int bytes = roi_batch_id_list.numel() * sizeof(int);
+    auto roi_ptr = allocator.Allocate(bytes);
+    int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+    const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+                 dev_ctx.stream());
+
+    GPUROIPoolForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
         output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
-        height, width, pooled_height, pooled_width,
-        roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
+        height, width, pooled_height, pooled_width, roi_id_data,
+        out->mutable_data<T>(ctx.GetPlace()),
         argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
 };
@@ -204,8 +210,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     if (x_grad) {
       framework::Tensor roi_batch_id_list;
       roi_batch_id_list.Resize({rois_num});
-      int* roi_batch_id_data =
-          roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+      auto cplace = platform::CPUPlace();
+      int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
       auto rois_lod = rois->lod().back();
       int rois_batch_size = rois_lod.size() - 1;
       for (int n = 0; n < rois_batch_size; ++n) {
@@ -213,25 +219,30 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
           roi_batch_id_data[i] = n;
         }
       }
-      framework::Tensor roi_batch_id_list_gpu;
-      framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(),
-                            ctx.device_context(), &roi_batch_id_list_gpu);
+
+      auto& dev_ctx = ctx.cuda_device_context();
+      auto& allocator =
+          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+      int bytes = roi_batch_id_list.numel() * sizeof(int);
+      auto roi_ptr = allocator.Allocate(bytes);
+      int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
+      const auto gplace = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+      memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes,
+                   dev_ctx.stream());
 
       x_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
-      set_zero(ctx.cuda_device_context(), x_grad, static_cast<T>(0));
+      set_zero(dev_ctx, x_grad, static_cast<T>(0));
 
       int output_grad_size = out_grad->numel();
       int blocks = NumBlocks(output_grad_size);
       int threads = kNumCUDAThreads;
 
       if (output_grad_size > 0) {
-        GPUROIPoolBackward<
-            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        GPUROIPoolBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
             output_grad_size, rois->data<T>(), out_grad->data<T>(),
             argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
-            width, pooled_height, pooled_width,
-            roi_batch_id_list_gpu.data<int>(),
+            width, pooled_height, pooled_width, roi_id_data,
             x_grad->mutable_data<T>(ctx.GetPlace()));
       }
     }
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 14746fa9515..c21b0c13c75 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -101,6 +101,10 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddOutput("Out",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
               " of elementwise logistic losses.");
+    AddAttr<bool>("normalize",
+                  "if true, divide the loss by the number of "
+                  "targets != ignore_index.")
+        .SetDefault(false);
     AddAttr<int>("ignore_index",
                  "(int, default kIgnoreIndex), Specifies a target value that "
                  "is ignored and"
@@ -145,9 +149,14 @@ REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
                   ops::SigmoidCrossEntropyWithLogitsGradOp);
-REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
-                       ops::SigmoidCrossEntropyWithLogitsKernel<
-                           paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_cross_entropy_with_logits,
+    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
+                                             float>,
+    ops::SigmoidCrossEntropyWithLogitsKernel<paddle::platform::CPUDeviceContext,
+                                             double>);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                           paddle::platform::CPUDeviceContext, float>);
+                           paddle::platform::CPUDeviceContext, float>,
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index a1fbc7e5fab..2a4570ef5ce 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -11,12 +11,184 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "cub/cub.cuh"
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static HOSTDEVICE float real_exp(float x) { return expf(x); }
+static HOSTDEVICE double real_exp(double x) { return exp(x); }  // keep double
+static HOSTDEVICE float real_log(float x) { return logf(x); }
+static HOSTDEVICE double real_log(double x) { return log(x); }  // keep double
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {  // grid size for N elements
+  const int wanted = (N + kNumCUDAThreads - 1) / kNumCUDAThreads;  // ceil-div
+  return wanted < kNumMaxinumNumBlocks ? wanted : kNumMaxinumNumBlocks;
+}
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
+                                  const int ignore_index, const int limit,
+                                  T *out_data, T *counts) {
+  CUDA_1D_KERNEL_LOOP(i, limit) {  // one loss value per element i < limit
+    T x = x_data[i];
+    T label = label_data[i];
+    T eps = static_cast<T>(1e-5);  // tolerance for the label comparison
+    T diff = label - static_cast<T>(ignore_index);
+    if ((diff > -eps) && (diff < eps)) {  // label equals ignore_index
+      out_data[i] = static_cast<T>(0.);
+      counts[i] = 0;  // ignored element: not counted
+    } else {
+      T term1 = (x > 0) ? x : 0;  // max(x, 0)
+      T term2 = x * label;
+      T term3 = real_log(static_cast<T>(1) + real_exp(static_cast<T>(-abs(x))));
+      out_data[i] = term1 - term2 + term3;
+      counts[i] = 1;  // counted element
+    }
+  }
+}
+
+template <typename T, int BlockDim>  // sum[0] = max(sum(counts[0..num)), eps)
+__global__ void Sum(const T *counts, int num, const T eps, T *sum) {
+  typedef cub::BlockReduce<double, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  T in = 0;  // this thread's partial sum of counts
+  for (int i = threadIdx.x; i < num; i += BlockDim) {
+    in += counts[i];  // indices threadIdx.x, threadIdx.x + BlockDim, ...
+  }
+  __syncthreads();
+  auto out =
+      BlockReduce(temp_storage).Reduce(static_cast<double>(in), cub::Sum());
+  __syncthreads();
+  if (threadIdx.x == 0) {  // thread 0 writes the result
+    T a = out > eps ? out : eps;  // lower-bound the result by eps
+    sum[0] = a;
+  }
+}
+
+template <typename T>  // divides loss[i] by norm[0] for every i < num
+__global__ void Div(T *loss, const int num, const T *norm) {
+  CUDA_1D_KERNEL_LOOP(idx, num) { loss[idx] = loss[idx] / norm[0]; }
+}
+
+template <typename T>
+__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
+                                   const int ignore_index, const T *dout_data,
+                                   const int limit, T *dx_data, T *counts) {
+  CUDA_1D_KERNEL_LOOP(i, limit) {  // one gradient value per element i < limit
+    T x = x_data[i];
+    T label = label_data[i];
+    T dout = dout_data[i];
+    T eps = static_cast<T>(1e-5);  // tolerance for the label comparison
+    T diff = label - static_cast<T>(ignore_index);
+    if ((diff > -eps) && (diff < eps)) {  // label equals ignore_index
+      dx_data[i] = static_cast<T>(0.);
+      counts[i] = 0;  // ignored element: not counted
+    } else {
+      T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + real_exp(-x));
+      T diff = simoid_x - label;  // shadows the outer diff
+      dx_data[i] = dout * diff;  // dout * (sigmoid(x) - label)
+      counts[i] = 1;  // counted element
+    }
+  }
+}
+
+// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))); 0 where ignored.
+template <typename DeviceContext, typename T>
+class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    Tensor *Out = context.Output<Tensor>("Out");
+    int ignore_index = context.Attr<int>("ignore_index");
+    auto out_data = Out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.cuda_device_context();
+    bool normalize = context.Attr<bool>("normalize");
+
+    // Temporary per-element buffer: 1 for counted labels, 0 for ignored ones.
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T));
+    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
+
+    int limit = Out->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+    GPUSigmoidForward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        X->data<T>(), Labels->data<T>(), ignore_index, limit, out_data, counts);
+    if (normalize) {  // divide Out by max(number of counted labels, 1e-5)
+      auto norm_ptr = allocator.Allocate(sizeof(T));
+      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
+      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          counts, limit, static_cast<T>(1e-5), norm);
+      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(out_data, limit, norm);
+    }
+  }
+};
+
+// dX = dOut * (sigmoid(X) - Labels); 0 where Label equals ignore_index.
+template <typename DeviceContext, typename T>
+class GPUSigmoidCrossEntropyWithLogitsGradKernel
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto dx_data = dX->mutable_data<T>(context.GetPlace());
+
+    int ignore_index = context.Attr<int>("ignore_index");
+
+    auto &dev_ctx = context.cuda_device_context();
+    // Temporary per-element buffer: 1 for counted labels, 0 for ignored ones.
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T));
+    T *counts = reinterpret_cast<T *>(cnt_ptr->ptr());
+
+    int limit = dX->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+    GPUSigmoidBackward<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        X->data<T>(), Labels->data<T>(), ignore_index, dOut->data<T>(), limit,
+        dx_data, counts);
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {  // divide dX by max(number of counted labels, 1e-5)
+      auto norm_ptr = allocator.Allocate(sizeof(T));
+      T *norm = reinterpret_cast<T *>(norm_ptr->ptr());
+      Sum<T, kNumCUDAThreads><<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>(
+          counts, limit, static_cast<T>(1e-5), norm);
+      Div<T><<<blocks, threads, 0, dev_ctx.stream()>>>(dx_data, limit, norm);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
-                        ops::SigmoidCrossEntropyWithLogitsKernel<
-                            paddle::platform::CUDADeviceContext, float>);
+                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        ops::GPUSigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
-                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
+                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, float>,
+                        ops::GPUSigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index 6e75f9e0b8d..8f459d573ae 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -13,54 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/eigen.h"
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T>
-struct SigmoidCrossEntropyWithLogitsForward {
-  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
-      : ignore_index(ignore_index) {}
-
-  HOSTDEVICE T operator()(const T &x, const T &label) const {
-    if (static_cast<int>(label) == ignore_index) {
-      return static_cast<T>(0.);
-    }
-    T term1 = (x > 0) ? x : 0;
-    T term2 = x * label;
-    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
-    return term1 - term2 + term3;
-  }
-
-  int ignore_index;
-};
-
-template <typename T>
-struct SigmoidCrossEntropyWithLogitsBackward {
-  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
-      : ignore_index(ignore_index) {}
-
-  HOSTDEVICE T operator()(const T &x, const T &label) const {
-    if (static_cast<int>(label) == ignore_index) {
-      return static_cast<T>(0.);
-    }
-    T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
-    return simoid_x - label;
-  }
-
-  int ignore_index;
-};
 
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
@@ -70,16 +30,37 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
     const Tensor *X = context.Input<Tensor>("X");
     const Tensor *Labels = context.Input<Tensor>("Label");
     Tensor *Out = context.Output<Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
     int ignore_index = context.Attr<int>("ignore_index");
-
-    auto x = EigenVector<T>::Flatten(*X);
-    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto out = EigenVector<T>::Flatten(*Out);
-    auto &place = *context.device_context<DeviceContext>().eigen_device();
-
-    out.device(place) = x.binaryExpr(
-        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
+    auto out_data = Out->mutable_data<T>(context.GetPlace());
+    int limit = Out->numel();
+    auto x_data = X->data<T>();
+    auto label_data = Labels->data<T>();
+    for (int idx = 0; idx < limit; ++idx) {
+      T x = x_data[idx];
+      T label = label_data[idx];
+      if (static_cast<int>(label) == ignore_index) {
+        out_data[idx] = static_cast<T>(0.);
+      } else {
+        T term1 = (x > 0) ? x : 0;
+        T term2 = x * label;
+        T term3 = std::log(static_cast<T>(1) + std::exp(-std::abs(x)));
+        out_data[idx] = term1 - term2 + term3;
+      }
+    }
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {
+      int norm = 0;
+      T eps = static_cast<T>(1e-6);
+      for (int idx = 0; idx < limit; ++idx) {
+        T diff = label_data[idx] - static_cast<T>(ignore_index);
+        if ((diff < -eps) || (diff > eps)) {
+          norm += 1;  // count the targets that are not ignore_index
+        }
+      }
+      eps = static_cast<T>(1e-5);  // divisor floor; must stay in T, not int
+      T den = norm > eps ? static_cast<T>(norm) : eps;
+      std::for_each(out_data, out_data + limit, [den](T &v) { v = v / den; });
+    }
   }
 };
 
@@ -92,19 +73,39 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
     const Tensor *Labels = context.Input<Tensor>("Label");
     const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    auto ignore_index = context.Attr<int>("ignore_index");
-    auto x = EigenVector<T>::Flatten(*X);
-    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto dout = EigenVector<T>::Flatten(*dOut);
-    auto dx = EigenVector<T>::Flatten(*dX);
-    auto &place =
-        *context.template device_context<DeviceContext>().eigen_device();
+    auto dx_data = dX->mutable_data<T>(context.GetPlace());
 
-    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
-                                         static_cast<int>(ignore_index)));
-    dx.device(place) = dout * diff;
+    int ignore_index = context.Attr<int>("ignore_index");
+    int limit = dX->numel();
+    auto x_data = X->data<T>();
+    auto label_data = Labels->data<T>();
+    auto dout_data = dOut->data<T>();
+    for (int idx = 0; idx < limit; ++idx) {
+      T x = x_data[idx];
+      T label = label_data[idx];
+      T dout = dout_data[idx];
+      if (static_cast<int>(label) == ignore_index) {
+        dx_data[idx] = static_cast<T>(0.);
+      } else {
+        T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+        T diff = simoid_x - label;
+        dx_data[idx] = dout * diff;
+      }
+    }
+    bool normalize = context.Attr<bool>("normalize");
+    if (normalize) {
+      int norm = 0;
+      T eps = static_cast<T>(1e-6);
+      for (int idx = 0; idx < limit; ++idx) {
+        T diff = label_data[idx] - static_cast<T>(ignore_index);
+        if ((diff < -eps) || (diff > eps)) {
+          norm += 1;  // count the targets that are not ignore_index
+        }
+      }
+      eps = static_cast<T>(1e-5);  // divisor floor; must stay in T, not int
+      T den = norm > eps ? static_cast<T>(norm) : eps;
+      std::for_each(dx_data, dx_data + limit, [den](T &v) { v = v / den; });
+    }
   }
 };
 
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 7b70d19de5c..a24e1d13003 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -88,8 +88,8 @@ class DataToLoDTensorConverter(object):
                     raise ValueError(
                         "Reshape error. What is defined in data layer is {}, but receive {}"
                         .format(self.shape, arr.shape))
-            else:
-                self._check_shape(arr.shape)
+            #else:
+            #    self._check_shape(arr.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 8aed97dc59b..cddc302d52e 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -44,6 +44,7 @@ __all__ = [
     'roi_perspective_transform',
     'generate_proposal_labels',
     'generate_proposals',
+    'generate_mask_labels',
     'iou_similarity',
     'box_coder',
     'polygon_box_transform',
@@ -1659,7 +1660,7 @@ def generate_proposal_labels(rpn_rois,
                              class_nums=None,
                              use_random=True):
     """
-    ** Generate proposal labels Faster-RCNN **
+    ** Generate Proposal Labels of Faster-RCNN **
     This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth,
     to sample foreground boxes and background boxes, and compute loss target.
 
@@ -1740,6 +1741,140 @@ def generate_proposal_labels(rpn_rois,
     return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights
 
 
+def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
+                         labels_int32, num_classes, resolution):
+    """
+    ** Generate Mask Labels for Mask-RCNN **
+
+    This operator can be, for given the RoIs and corresponding labels,
+    to sample foreground RoIs. This mask branch also has
+    a :math: `K \\times M^{2}` dimensional output targets for each foreground
+    RoI, which encodes K binary masks of resolution M x M, one for each of the
+    K classes. This mask targets are used to compute loss of mask branch.
+
+    Please note, the data format of ground-truth segmentation, assumed the
+    segmentations are as follows. The first instance has two gt objects.
+    The second instance has one gt object, this object has two gt segmentations.
+
+        .. code-block:: python
+
+            #[
+            #  [[[229.14, 370.9, 229.14, 370.9, ...]],
+            #   [[343.7, 139.85, 349.01, 138.46, ...]]], # 0-th instance
+            #  [[[500.0, 390.62, ...],[115.48, 187.86, ...]]] # 1-th instance
+            #]
+
+            batch_masks = []
+            for semgs in batch_semgs:
+                gt_masks = []
+                for semg in semgs:
+                    gt_segm = []
+                    for polys in semg:
+                        gt_segm.append(np.array(polys).reshape(-1, 2))
+                    gt_masks.append(gt_segm)
+                batch_masks.append(gt_masks)
+            
+            
+            place = fluid.CPUPlace()
+            feeder = fluid.DataFeeder(place=place, feed_list=feeds)
+            feeder.feed(batch_masks)
+
+    Args:
+        im_info(Variable): A 2-D Tensor with shape [N, 3]. N is the batch size,
+            each element is [height, width, scale] of image. Image scale is
+            target_size / original_size.
+        gt_classes(Variable): A 2-D LoDTensor with shape [M, 1]. M is the total
+            number of ground-truth, each element is a class label.
+        is_crowd(Variable): A 2-D LoDTensor with shape as gt_classes,
+            each element is a flag indicating whether a groundtruth is crowd.
+        gt_segms(Variable): This input is a 2D LoDTensor with shape [S, 2],
+            its LoD level is 3. Usually users do not need to understand LoD;
+            they only need to return the correct data format in the reader.
+
+
+
+            The LoD[0] represents the gt objects number of
+            each instance. LoD[1] represents the segmentation counts of each
+            objects. LoD[2] represents the polygons number of each segmentation.
+            S is the total number of polygon coordinate points. Each element is
+            (x, y) coordinate points.
+        rois(Variable): A 2-D LoDTensor with shape [R, 4]. R is the total
+            number of RoIs, each element is a bounding box with
+            (xmin, ymin, xmax, ymax) format in the range of original image.
+        labels_int32(Variable): A 2-D LoDTensor in shape of [R, 1] with type
+            of int32. R is the same as it in `rois`. Each element represents
+            a class label of a RoI.
+        num_classes(int): Class number.
+        resolution(int): Resolution of mask predictions.
+
+    Returns:
+        mask_rois (Variable):  A 2D LoDTensor with shape [P, 4]. P is the total
+            number of sampled RoIs. Each element is a bounding box with
+            [xmin, ymin, xmax, ymax] format in range of original image size.
+        mask_rois_has_mask_int32 (Variable): A 2D LoDTensor with shape [P, 1],
+            each element represents the output mask RoI index with regard to
+            input RoIs.
+        mask_int32 (Variable): A 2D LoDTensor with shape [P, K * M * M],
+            K is the classes number and M is the resolution of mask predictions.
+            Each element represents the binary mask targets.
+
+    Examples:
+        .. code-block:: python
+
+          im_info = fluid.layers.data(name="im_info", shape=[3],
+              dtype="float32")
+          gt_classes = fluid.layers.data(name="gt_classes", shape=[1],
+              dtype="float32", lod_level=1)
+          is_crowd = fluid.layers.data(name="is_crowd", shape=[1],
+              dtype="float32", lod_level=1)
+          gt_masks = fluid.layers.data(name="gt_masks", shape=[2],
+              dtype="float32", lod_level=3)
+          # rois, labels_int32 can be the output of
+          # fluid.layers.generate_proposal_labels.
+          mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels(
+              im_info=im_info,
+              gt_classes=gt_classes,
+              is_crowd=is_crowd,
+              gt_segms=gt_masks,
+              rois=rois,
+              labels_int32=labels_int32,
+              num_classes=81,
+              resolution=14)
+    """
+
+    helper = LayerHelper('generate_mask_labels', **locals())
+
+    mask_rois = helper.create_variable_for_type_inference(dtype=rois.dtype)
+    roi_has_mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+    mask_int32 = helper.create_variable_for_type_inference(
+        dtype=gt_classes.dtype)
+
+    helper.append_op(
+        type="generate_mask_labels",
+        inputs={
+            'ImInfo': im_info,
+            'GtClasses': gt_classes,
+            'IsCrowd': is_crowd,
+            'GtSegms': gt_segms,
+            'Rois': rois,
+            'LabelsInt32': labels_int32
+        },
+        outputs={
+            'MaskRois': mask_rois,
+            'RoiHasMaskInt32': roi_has_mask_int32,
+            'MaskInt32': mask_int32
+        },
+        attrs={'num_classes': num_classes,
+               'resolution': resolution})
+
+    mask_rois.stop_gradient = True
+    roi_has_mask_int32.stop_gradient = True
+    mask_int32.stop_gradient = True
+
+    return mask_rois, roi_has_mask_int32, mask_int32
+
+
 def generate_proposals(scores,
                        bbox_deltas,
                        im_info,
@@ -1754,33 +1889,48 @@ def generate_proposals(scores,
     """
     **Generate proposal Faster-RCNN**
 
-    This operation proposes RoIs according to each box with their probability to be a foreground object and 
-    the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals
+    This operation proposes RoIs according to each box with their
+    probability to be a foreground object and 
+    the box can be calculated by anchors. Bbox_deltas and scores
+    to be an object are the output of RPN. Final proposals
     could be used to train detection net.
 
     For generating proposals, this operation performs following steps:
 
-    1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4)
+    1. Transposes and resizes scores and bbox_deltas in size of
+       (H*W*A, 1) and (H*W*A, 4)
     2. Calculate box locations as proposals candidates. 
     3. Clip boxes to image
     4. Remove predicted boxes with small area. 
     5. Apply NMS to get final proposals as output.
 
     Args:
-        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object.
-            N is batch size, A is number of anchors, H and W are height and width of the feature map.
-        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. 
-        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale
+        scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents
+            the probability for each box to be an object.
+            N is batch size, A is number of anchors, H and W are height and
+            width of the feature map.
+        bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W]
+            represents the difference between predicted box location and
+            anchor location.
+        im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin
+            image information for N batch. Info contains height, width and scale
             between origin image size and the size of feature map.
-        anchors(Variable):   A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map,
-                    num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
-        variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format.
-        pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default.
-        post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default.
+        anchors(Variable):   A 4-D Tensor represents the anchors with a layout
+            of [H, W, A, 4]. H and W are height and width of the feature map,
+            num_anchors is the box count of each position. Each anchor is
+            in (xmin, ymin, xmax, ymax) format and unnormalized.
+        variances(Variable): The expanded variances of anchors with a layout of
+            [H, W, num_priors, 4]. Each variance is in
+            (xcenter, ycenter, w, h) format.
+        pre_nms_top_n(float): Number of total bboxes to be kept per
+            image before NMS. 6000 by default.
+        post_nms_top_n(float): Number of total bboxes to be kept per
+            image after NMS. 1000 by default.
         nms_thresh(float): Threshold in NMS, 0.5 by default.
-        min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default.
-        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration.
-
+        min_size(float): Remove predicted boxes with either height or
+            width < min_size. 0.1 by default.
+        eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5,
+            adaptive_threshold = adaptive_threshold * eta in each iteration.
     """
     helper = LayerHelper('generate_proposals', **locals())
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 503c91c27ba..6765a89a1b0 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -8927,7 +8927,8 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 def sigmoid_cross_entropy_with_logits(x,
                                       label,
                                       ignore_index=kIgnoreIndex,
-                                      name=None):
+                                      name=None,
+                                      normalize=False):
     """
     ${comment}
 
@@ -8936,9 +8937,25 @@ def sigmoid_cross_entropy_with_logits(x,
         label(${label_type}): ${label_comment}
         ignore_index(&{ignore_index}): ${ignore_index_comment}
         name(basestring|None): Name of the output.
+        normalize(bool): If true, divide the output by the number of
+            targets != ignore_index.
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(
+                name='data', shape=[10], dtype='float32')
+            label = fluid.layers.data(
+                name='label', shape=[10], dtype='float32')
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=input,
+                label=label,
+                ignore_index=-1,
+                normalize=True) # or False
+            # loss = fluid.layers.reduce_sum(loss) # summation of loss
     """
 
     helper = LayerHelper("sigmoid_cross_entropy_with_logits", **locals())
@@ -8953,7 +8970,8 @@ def sigmoid_cross_entropy_with_logits(x,
         type="sigmoid_cross_entropy_with_logits",
         inputs={"X": x,
                 "Label": label},
-        attrs={"ignore_index": ignore_index},
+        attrs={"ignore_index": ignore_index,
+               'normalize': normalize},
         outputs={"Out": out})
     return out
 
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index d99eaa0634f..2d9ed9f9c69 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -203,7 +203,7 @@ class TestGenerateProposalLabels(unittest.TestCase):
                 lod_level=1,
                 append_batch_size=False)
             class_nums = 5
-            rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
+            outs = fluid.layers.generate_proposal_labels(
                 rpn_rois=rpn_rois,
                 gt_classes=gt_classes,
                 is_crowd=is_crowd,
@@ -216,6 +216,11 @@ class TestGenerateProposalLabels(unittest.TestCase):
                 bg_thresh_lo=0.0,
                 bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
                 class_nums=class_nums)
+            rois = outs[0]
+            labels_int32 = outs[1]
+            bbox_targets = outs[2]
+            bbox_inside_weights = outs[3]
+            bbox_outside_weights = outs[4]
             assert rois.shape[1] == 4
             assert rois.shape[0] == labels_int32.shape[0]
             assert rois.shape[0] == bbox_targets.shape[0]
@@ -226,6 +231,62 @@ class TestGenerateProposalLabels(unittest.TestCase):
             assert bbox_outside_weights.shape[1] == 4 * class_nums
 
 
+class TestGenerateMaskLabels(unittest.TestCase):
+    def test_generate_mask_labels(self):
+        program = Program()
+        with program_guard(program):
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_classes = layers.data(
+                name='gt_classes',
+                shape=[2, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[2, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            gt_segms = layers.data(
+                name='gt_segms',
+                shape=[20, 2],
+                dtype='float32',
+                lod_level=3,
+                append_batch_size=False)
+            rois = layers.data(
+                name='rois',
+                shape=[4, 4],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
+            labels_int32 = layers.data(
+                name='labels_int32',
+                shape=[4, 1],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            num_classes = 5
+            resolution = 14
+            outs = fluid.layers.generate_mask_labels(
+                im_info=im_info,
+                gt_classes=gt_classes,
+                is_crowd=is_crowd,
+                gt_segms=gt_segms,
+                rois=rois,
+                labels_int32=labels_int32,
+                num_classes=num_classes,
+                resolution=resolution)
+            mask_rois, roi_has_mask_int32, mask_int32 = outs
+            assert mask_rois.shape[1] == 4
+            assert mask_int32.shape[1] == num_classes * resolution * resolution
+
+
 class TestMultiBoxHead(unittest.TestCase):
     def test_multi_box_head(self):
         data_shape = [3, 224, 224]
@@ -313,7 +374,7 @@ class TestRpnTargetAssign(unittest.TestCase):
                 name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
             is_crowd = layers.data(
                 name='is_crowd',
-                shape=[10],
+                shape=[1, 10],
                 dtype='int32',
                 lod_level=1,
                 append_batch_size=False)
@@ -323,7 +384,7 @@ class TestRpnTargetAssign(unittest.TestCase):
                 dtype='float32',
                 lod_level=1,
                 append_batch_size=False)
-            pred_scores, pred_loc, tgt_lbl, tgt_bbox, bbox_inside_weight = layers.rpn_target_assign(
+            outs = layers.rpn_target_assign(
                 bbox_pred=bbox_pred,
                 cls_logits=cls_logits,
                 anchor_box=anchor_box,
@@ -337,6 +398,11 @@ class TestRpnTargetAssign(unittest.TestCase):
                 rpn_positive_overlap=0.7,
                 rpn_negative_overlap=0.3,
                 use_random=False)
+            pred_scores = outs[0]
+            pred_loc = outs[1]
+            tgt_lbl = outs[2]
+            tgt_bbox = outs[3]
+            bbox_inside_weight = outs[4]
 
             self.assertIsNotNone(pred_scores)
             self.assertIsNotNone(pred_loc)
@@ -351,41 +417,43 @@ class TestRpnTargetAssign(unittest.TestCase):
 
 class TestGenerateProposals(unittest.TestCase):
     def test_generate_proposals(self):
-        data_shape = [20, 64, 64]
-        images = fluid.layers.data(
-            name='images', shape=data_shape, dtype='float32')
-        im_info = fluid.layers.data(
-            name='im_info', shape=[1, 3], dtype='float32')
-        anchors, variances = fluid.layers.anchor_generator(
-            name='anchor_generator',
-            input=images,
-            anchor_sizes=[32, 64],
-            aspect_ratios=[1.0],
-            variance=[0.1, 0.1, 0.2, 0.2],
-            stride=[16.0, 16.0],
-            offset=0.5)
-        num_anchors = anchors.shape[2]
-        scores = fluid.layers.data(
-            name='scores', shape=[1, num_anchors, 8, 8], dtype='float32')
-        bbox_deltas = fluid.layers.data(
-            name='bbox_deltas',
-            shape=[1, num_anchors * 4, 8, 8],
-            dtype='float32')
-        rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
-            name='generate_proposals',
-            scores=scores,
-            bbox_deltas=bbox_deltas,
-            im_info=im_info,
-            anchors=anchors,
-            variances=variances,
-            pre_nms_top_n=6000,
-            post_nms_top_n=1000,
-            nms_thresh=0.5,
-            min_size=0.1,
-            eta=1.0)
-        self.assertIsNotNone(rpn_rois)
-        self.assertIsNotNone(rpn_roi_probs)
-        print(rpn_rois.shape)
+        program = Program()
+        with program_guard(program):
+            data_shape = [20, 64, 64]
+            images = fluid.layers.data(
+                name='images', shape=data_shape, dtype='float32')
+            im_info = fluid.layers.data(
+                name='im_info', shape=[3], dtype='float32')
+            anchors, variances = fluid.layers.anchor_generator(
+                name='anchor_generator',
+                input=images,
+                anchor_sizes=[32, 64],
+                aspect_ratios=[1.0],
+                variance=[0.1, 0.1, 0.2, 0.2],
+                stride=[16.0, 16.0],
+                offset=0.5)
+            num_anchors = anchors.shape[2]
+            scores = fluid.layers.data(
+                name='scores', shape=[num_anchors, 8, 8], dtype='float32')
+            bbox_deltas = fluid.layers.data(
+                name='bbox_deltas',
+                shape=[num_anchors * 4, 8, 8],
+                dtype='float32')
+            rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
+                name='generate_proposals',
+                scores=scores,
+                bbox_deltas=bbox_deltas,
+                im_info=im_info,
+                anchors=anchors,
+                variances=variances,
+                pre_nms_top_n=6000,
+                post_nms_top_n=1000,
+                nms_thresh=0.5,
+                min_size=0.1,
+                eta=1.0)
+            self.assertIsNotNone(rpn_rois)
+            self.assertIsNotNone(rpn_roi_probs)
+            print(rpn_rois.shape)
 
 
 class TestYoloDetection(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
new file mode 100644
index 00000000000..1d7ce33ea7c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py
@@ -0,0 +1,421 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+import math
+import six
+import paddle.fluid as fluid
+from op_test import OpTest
+'''
+# Equivalent code
+rles = mask_util.frPyObjects([segm], im_h, im_w)
+mask = mask_util.decode(rles)
+'''
+
+
+def decode(cnts, m):
+    """Expand the first `m` run lengths in `cnts` into a flat 0/1 list;
+    runs alternate in value, starting with 0."""
+    mask = []
+    for j in range(m):
+        # Even-indexed runs are 0s, odd-indexed runs are 1s.
+        mask.extend([j % 2] * cnts[j])
+    return mask
+
+
+def poly2mask(xy, k, h, w):
+    """Rasterize the closed polygon `xy` (flat [x0, y0, x1, y1, ...] list with
+    `k` vertices) into an (h, w) array of 0/1, mirroring the
+    mask_util.frPyObjects + mask_util.decode pair quoted in this file.
+    """
+    scale = 5.
+    x = [int(scale * p + 0.5) for p in xy[::2]]
+    x = x + [x[0]]
+    y = [int(scale * p + 0.5) for p in xy[1::2]]
+    y = y + [y[0]]
+
+    u, v = [], []
+    for j in range(k):
+        xs = x[j]
+        xe = x[j + 1]
+        ys = y[j]
+        ye = y[j + 1]
+        dx = abs(xe - xs)
+        dy = abs(ys - ye)
+        flip = (dx >= dy and xs > xe) or (dx < dy and ys > ye)
+        if flip:
+            xs, xe = xe, xs
+            ys, ye = ye, ys
+
+        if dx >= dy:
+            if (dx == 0): assert ye - ys == 0
+            s = 0 if dx == 0 else float(ye - ys) / dx
+        else:
+            if (dy == 0): assert xe - xs == 0
+            s = 0 if dy == 0 else float(xe - xs) / dy
+
+        if dx >= dy:
+            ts = [dx - d if flip else d for d in range(dx + 1)]
+            u.extend([xs + t for t in ts])
+            v.extend([int(ys + s * t + .5) for t in ts])
+        else:
+            ts = [dy - d if flip else d for d in range(dy + 1)]
+            v.extend([t + ys for t in ts])
+            u.extend([int(xs + s * t + .5) for t in ts])
+
+    k = len(u)
+    x = np.zeros((k), int)  # np.int alias was removed in NumPy 1.24
+    y = np.zeros((k), int)
+    m = 0
+    for j in six.moves.xrange(1, k):
+        if u[j] != u[j - 1]:
+            xd = float(u[j] if (u[j] < u[j - 1]) else (u[j] - 1))
+            xd = (xd + .5) / scale - .5
+            if (math.floor(xd) != xd or xd < 0 or xd > (w - 1)):
+                continue
+            yd = float(v[j] if v[j] < v[j - 1] else v[j - 1])
+            yd = (yd + .5) / scale - .5
+            yd = math.ceil(0 if yd < 0 else (h if yd > h else yd))
+            x[m] = int(xd)
+            y[m] = int(yd)
+            m += 1
+    k = m
+    a = [int(x[i] * h + y[i]) for i in range(k)]
+    a.append(h * w)
+    a.sort()
+    b = [0] + a[:len(a) - 1]
+    a = [c - d for (c, d) in zip(a, b)]
+
+    k += 1
+    b = [0 for i in range(k)]
+    b[0] = a[0]
+    m, j = 1, 1
+    while (j < k):
+        if a[j] > 0:
+            b[m] = a[j]
+            m += 1
+            j += 1
+        else:
+            j += 1
+            if (j < k):
+                b[m - 1] += a[j]
+                j += 1
+    mask = decode(b, m)
+    mask = np.array(mask, dtype=int).reshape((w, h))
+    mask = mask.transpose((1, 0))
+    return mask
+
+
+def polys_to_boxes(polys):
+    """Return a float32 (len(polys), 4) array of [x0, y0, x1, y1] boxes, each
+    the tight bound over all parts of the corresponding `polys` entry."""
+    boxes = np.zeros((len(polys), 4), dtype=np.float32)
+    for idx, parts in enumerate(polys):
+        left = min(min(part[::2]) for part in parts)
+        right = max(max(part[::2]) for part in parts)
+        top = min(min(part[1::2]) for part in parts)
+        bottom = max(max(part[1::2]) for part in parts)
+        boxes[idx, :] = [left, top, right, bottom]
+    return boxes
+
+
+def bbox_overlaps(boxes, query_boxes):
+    """Return the (N, K) matrix of pairwise IoU between `boxes` (N rows) and
+    `query_boxes` (K rows), using inclusive "+ 1" pixel extents."""
+    N, K = boxes.shape[0], query_boxes.shape[0]
+    overlaps = np.zeros((N, K), dtype=boxes.dtype)
+    for k in range(K):
+        q = query_boxes[k]
+        box_area = (q[2] - q[0] + 1) * (q[3] - q[1] + 1)
+        for n in range(N):
+            b = boxes[n]
+            iw = min(b[2], q[2]) - max(b[0], q[0]) + 1
+            if not iw > 0:
+                continue
+            ih = min(b[3], q[3]) - max(b[1], q[1]) + 1
+            if not ih > 0:
+                continue
+            area = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
+            ua = float(area + box_area - iw * ih)
+            overlaps[n, k] = iw * ih / ua
+    return overlaps
+
+
+def polys_to_mask_wrt_box(polygons, box, M):
+    """Rasterize COCO-style polygon parts into one (M, M) float32 mask.
+
+    Each flat [x, y, ...] part in `polygons` is shifted by (box[0], box[1])
+    and scaled so that the box spans M x M, then rasterized with poly2mask.
+    A pixel of the result is 1.0 when at least one part covers it and 0.0
+    otherwise.
+    """
+    # Clamp the box extent to at least 1 before it is used as a divisor.
+    w = np.maximum(box[2] - box[0], 1)
+    h = np.maximum(box[3] - box[1], 1)
+
+    scaled_parts = []
+    for part in polygons:
+        pts = np.array(part, dtype=np.float32)
+        pts[0::2] = (pts[0::2] - box[0]) * M / w
+        pts[1::2] = (pts[1::2] - box[1]) * M / h
+        scaled_parts.append(pts)
+
+    part_masks = []
+    for pts in scaled_parts:
+        assert pts.shape[0] % 2 == 0
+        k = pts.shape[0] // 2
+        part_masks.append(poly2mask(pts, k, M, M))
+
+    # Sum over parts, then threshold: a pixel is set if any part covers it.
+    stacked = np.array(part_masks)
+    covered = np.sum(stacked, axis=0)
+    return np.array(covered > 0, dtype=np.float32)
+
+
+def expand_mask_targets(masks, mask_class_labels, resolution, num_classes):
+    """Scatter each (resolution ** 2)-wide mask row into its class slot.
+
+    Returns an int32 array of shape (#masks, num_classes * resolution ** 2)
+    filled with -1 (ignore) except for the slot of each row's label.
+    """
+    num_masks = masks.shape[0]
+    assert num_masks == mask_class_labels.shape[0]
+
+    slot = resolution**2
+    targets = np.full((num_masks, num_classes * slot), -1, dtype=np.int32)
+    for row in range(num_masks):
+        label = int(mask_class_labels[row])
+        # Label 0 is background: its row stays all -1. This only occurs
+        # for the placeholder row emitted when an image has no fg RoI.
+        if label <= 0:
+            continue
+        targets[row, slot * label:slot * (label + 1)] = masks[row, :]
+    return targets
+
+
+def generate_mask_labels(num_classes, im_info, gt_classes, is_crowd,
+                         label_int32, gt_polys, resolution, rois, roi_lod,
+                         gt_lod):
+    """Run _sample_mask per image and collect the per-image results.
+
+    `roi_lod` / `gt_lod` hold offsets: image i owns rows lod[i]:lod[i + 1].
+    Returns (mask_rois, roi_has_mask_int32, mask_int32, new_lod) as lists.
+    """
+    mask_rois, roi_has_mask_int32, mask_int32, new_lod = [], [], [], []
+    for i, info in enumerate(im_info):
+        roi_rows = slice(roi_lod[i], roi_lod[i + 1])
+        gt_rows = slice(gt_lod[i], gt_lod[i + 1])
+        blob = _sample_mask(num_classes, info, gt_classes[gt_rows],
+                            is_crowd[gt_rows], label_int32[roi_rows],
+                            gt_polys[i], resolution, rois[roi_rows])
+        new_lod.append(blob['mask_rois'].shape[0])
+        mask_rois.append(blob['mask_rois'])
+        roi_has_mask_int32.append(blob['roi_has_mask_int32'])
+        mask_int32.append(blob['mask_int32'])
+    return mask_rois, roi_has_mask_int32, mask_int32, new_lod
+
+
+def _sample_mask(num_classes, im_info, gt_classes, is_crowd, label_int32,
+                 gt_polys, resolution, rois):
+    """Build the reference mask targets for one image.
+
+    `gt_polys` is nested as [[[], []], []] (per ground truth, a list of flat
+    polygon parts). Returns a dict with keys 'mask_rois',
+    'roi_has_mask_int32' and 'mask_int32'; `rois` is left unmodified.
+    """
+    im_scale = im_info[2]
+    # Polygons are kept only for non-crowd ground truths with class > 0.
+    polys_gt_inds = np.where((gt_classes > 0) & (is_crowd == 0))[0]
+    polys_gt = [gt_polys[i] for i in polys_gt_inds]
+    boxes_from_polys = polys_to_boxes(polys_gt)
+
+    fg_inds = np.where(label_int32 > 0)[0]
+    roi_has_mask = fg_inds.copy()
+    if fg_inds.shape[0] > 0:
+        mask_class_labels = label_int32[fg_inds]
+        masks = np.zeros((fg_inds.shape[0], resolution**2), dtype=np.int32)
+        rois_fg = rois[fg_inds]
+        # Pair each fg RoI with the gt whose polygon box overlaps it most.
+        overlaps_bbfg_bbpolys = bbox_overlaps(
+            rois_fg.astype(np.float32), boxes_from_polys.astype(np.float32))
+        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)
+        for i in range(rois_fg.shape[0]):
+            poly_gt = polys_gt[fg_polys_inds[i]]
+            mask = polys_to_mask_wrt_box(poly_gt, rois_fg[i], resolution)
+            mask = np.array(mask > 0, dtype=np.int32)
+            masks[i, :] = np.reshape(mask, resolution**2)
+    else:
+        # No fg RoI: use the first background RoI, an all -1 mask, label 0.
+        bg_inds = np.where(label_int32 == 0)[0]
+        rois_fg = rois[bg_inds[0]].reshape((1, -1))
+        masks = -np.ones((1, resolution**2), dtype=np.int32)
+        mask_class_labels = np.zeros((1, ))
+        roi_has_mask = np.append(roi_has_mask, 0)
+    masks = expand_mask_targets(masks, mask_class_labels, resolution,
+                                num_classes)
+    # Scale out of place: in the else branch rois_fg is a view of `rois`,
+    # so the original in-place `*=` rescaled the caller's array as well.
+    return {
+        'mask_rois': rois_fg * im_scale,
+        'roi_has_mask_int32': roi_has_mask,
+        'mask_int32': masks
+    }
+
+
+def trans_lod(lod):
+    offsets = [0]  # lengths -> offsets: [a, b] becomes [0, a, a + b]
+    for length in lod:
+        offsets.append(length + offsets[-1])
+    return offsets
+
+
+class TestGenerateMaskLabels(OpTest):
+    """Checks the generate_mask_labels op against the numpy reference."""
+
+    def set_data(self):
+        # Order matters: each helper reads attributes set by the earlier ones.
+        self.init_test_case()
+        self.make_generate_proposal_labels_out()
+        self.generate_gt_polys()
+        self.generate_groundtruth()
+        self.init_test_output()
+        self.inputs = {
+            'ImInfo': self.im_info,
+            'GtClasses': (self.gt_classes.astype(np.int32), self.gt_lod),
+            'IsCrowd': (self.is_crowd.astype(np.int32), self.gt_lod),
+            'LabelsInt32': (self.label_int32.astype(np.int32), self.rois_lod),
+            'GtSegms': (self.gt_polys.astype(np.float32), self.masks_lod),
+            'Rois': (self.rois.astype(np.float32), self.rois_lod)
+        }
+        self.attrs = {
+            'num_classes': self.num_classes,
+            'resolution': self.resolution
+        }
+        self.outputs = {
+            'MaskRois': (self.mask_rois, [self.new_lod]),
+            'RoiHasMaskInt32': (self.roi_has_mask_int32, [self.new_lod]),
+            'MaskInt32': (self.mask_int32, [self.new_lod])
+        }
+
+    def init_test_case(self):
+        self.num_classes = 81
+        self.resolution = 14
+        self.batch_size = 2
+        self.batch_size_per_im = 64
+        self.images_shape = [100, 200]
+        np.random.seed(0)  # fixed seed keeps the generated data reproducible
+
+    def make_generate_proposal_labels_out(self):
+        """Create random self.rois / self.rois_lod and self.label_int32."""
+        rois = []
+        self.rois_lod = [[]]
+        self.label_int32 = []
+        for _ in range(self.batch_size):
+            self.rois_lod[0].append(self.batch_size_per_im)
+            for _ in range(self.batch_size_per_im):
+                xywh = np.random.rand(4)
+                xy1 = xywh[0:2] * 2
+                wh = xywh[2:4] * (self.images_shape[0] - xy1)
+                xy2 = xy1 + wh
+                rois.append([xy1[0], xy1[1], xy2[0], xy2[1]])
+        self.rois = np.array(rois).astype("float32")
+        for idx, roi_num in enumerate(self.rois_lod[0]):
+            for _ in range(roi_num):
+                # randint's upper bound is exclusive, so this draws from
+                # [1, num_classes - 1] exactly like random_integers did.
+                class_id = np.random.randint(1, self.num_classes)
+                # Image 0 gets no foreground, to test the empty case.
+                self.label_int32.append(0 if idx == 0 else class_id)
+        self.label_int32 = np.array(self.label_int32)[:, np.newaxis]
+
+    def generate_gt_polys(self):
+        """Create random polygons: self.gt_polys_list[image][gt][part] is a
+        flat [x, y, ...] list; self.gt_polys flattens all points to (n, 2)
+        with self.masks_lod = [gts per image, parts per gt, points per part].
+        """
+        h, w = self.images_shape[0:2]
+        self.gt_polys = []
+        self.gt_polys_list = []
+        max_gt = 4
+        max_poly_num = 5
+        min_poly_size = 4
+        max_poly_size = 16
+        lod0, lod1, lod2 = [], [], []
+        for _ in range(self.batch_size):
+            gt_num = np.random.randint(1, high=max_gt, size=1)[0]
+            lod0.append(gt_num)
+            ptss = []
+            for _ in range(gt_num):
+                poly_num = np.random.randint(1, max_poly_num, size=1)[0]
+                lod1.append(poly_num)
+                pts = []
+                for _ in range(poly_num):
+                    poly_size = np.random.randint(
+                        min_poly_size, max_poly_size, size=1)[0]
+                    x = np.random.rand(poly_size, 1) * w
+                    y = np.random.rand(poly_size, 1) * h
+                    flat_xy = np.concatenate((x, y), axis=1).flatten().tolist()
+                    pts.append(flat_xy)
+                    self.gt_polys.extend(flat_xy)
+                    lod2.append(poly_size)
+                ptss.append(pts)
+            self.gt_polys_list.append(ptss)
+        self.masks_lod = [lod0, lod1, lod2]
+        self.gt_lod = [lod0]
+        self.gt_polys = np.array(self.gt_polys).astype('float32').reshape(-1, 2)
+
+    def generate_groundtruth(self):
+        """Create im_info rows plus a random class and is_crowd=0 per gt."""
+        self.im_info = []
+        self.gt_classes = []
+        self.is_crowd = []
+        for gt_num in self.gt_lod[0]:
+            # im_info row: images_shape followed by a scale of 1.0.
+            self.im_info.append(self.images_shape + [1.0])
+            for _ in range(gt_num):
+                # Draws from [1, num_classes - 1]; randint excludes its high.
+                self.gt_classes.append(np.random.randint(1, self.num_classes))
+                self.is_crowd.append(0)
+        self.im_info = np.array(self.im_info).astype(np.float32)
+        self.gt_classes = np.array(self.gt_classes)[:, np.newaxis]
+        self.is_crowd = np.array(self.is_crowd)[:, np.newaxis]
+
+    def init_test_output(self):
+        """Compute the expected outputs with generate_mask_labels()."""
+        roi_lod = trans_lod(self.rois_lod[0])
+        gt_lod = trans_lod(self.gt_lod[0])
+        outs = generate_mask_labels(
+            self.num_classes, self.im_info, self.gt_classes, self.is_crowd,
+            self.label_int32, self.gt_polys_list, self.resolution, self.rois,
+            roi_lod, gt_lod)
+        mask_rois, roi_has_mask, mask_int32, self.new_lod = outs
+        # Stack the per-image pieces; the has-mask indices become a column.
+        self.mask_rois = np.vstack(mask_rois)
+        self.roi_has_mask_int32 = np.hstack(roi_has_mask)[:, np.newaxis]
+        self.mask_int32 = np.vstack(mask_int32)
+
+    def setUp(self):
+        self.op_type = "generate_mask_labels"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index 2d5cd3b24bf..5f6328707fd 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://w_idxw.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index 9340d558577..5ce405dccae 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://w_idxw.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import print_function
+
 import unittest
 import numpy as np
 import sys
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 41797a241ca..ae1883f1f7e 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -18,6 +18,7 @@ import numpy as np
 from op_test import OpTest
 from scipy.special import logit
 from scipy.special import expit
+import paddle.fluid.core as core
 import unittest
 
 
@@ -117,5 +118,36 @@ class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestSigmoidCrossEntropyWithNorm(OpTest):
+    """Checks sigmoid_cross_entropy_with_logits with normalize=True."""
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        shape = (64, 20)  # (batch_size, num_classes)
+        ignore_index = -1
+        # Labels are drawn from {-1, 0, 1}; -1 entries are the ignored ones.
+        x = logit(np.random.uniform(0, 1, shape).astype("float32"))
+        label = np.random.randint(-1, 2, shape).astype("float32")
+        self.inputs = {'X': x, 'Label': label}
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+
+        sigmoid_X = expit(x)
+        pos_term = label * np.log(sigmoid_X)
+        neg_term = (1 - label) * np.log(1 - sigmoid_X)
+        out = -pos_term - neg_term
+        ignored = label == ignore_index
+        out[ignored] = 0
+        # Normalize by the number of entries that were not ignored.
+        if self.attrs['normalize']:
+            out = out / float(np.count_nonzero(~ignored))
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab


From e6218c1d7b8f60c56f70ecdda8f0a26ce2c690f3 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Wed, 23 Jan 2019 05:16:49 +0000
Subject: [PATCH 71/73] change the input to a smaller value test=develop

---
 paddle/fluid/inference/tests/api/tester_helper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 18ed7175574..b1f7a3464ac 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -183,7 +183,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
     float *input_data = static_cast<float *>(input.data.data());
     // fill input data, for profile easily, do not use random data here.
     for (size_t j = 0; j < len; ++j) {
-      *(input_data + j) = Random(0, 10.);
+      *(input_data + j) = Random(0.0, 1.0) / 10.;
     }
   }
   (*inputs).emplace_back(input_slots);
-- 
GitLab


From 8b50ad80ff6934512d3959947ac1e71ea3fb9ea3 Mon Sep 17 00:00:00 2001
From: tangwei12 <tangwei12@baidu.com>
Date: Wed, 23 Jan 2019 15:13:22 +0800
Subject: [PATCH 72/73] checkpoint at distributed training (#14854)

checkpoint for distributed training.
---
 .../operators/distributed/grpc/grpc_client.cc |  89 ++--
 .../operators/distributed/grpc/grpc_client.h  |  17 +-
 .../operators/distributed/grpc/grpc_server.cc |  59 ++-
 .../operators/distributed/grpc/grpc_service.h |   3 +
 .../operators/distributed/request_handler.h   |  13 +
 .../distributed/request_handler_impl.cc       |  30 +-
 .../distributed/request_handler_impl.h        |  10 +
 .../fluid/operators/distributed/rpc_client.h  |   7 +
 .../operators/distributed/send_recv.proto.in  |  18 +
 .../distributed_ops/listen_and_serv_op.cc     |   5 +
 .../distributed_ops/listen_and_serv_op.h      |   3 +-
 .../operators/distributed_ops/recv_op.cc      |  63 ++-
 paddle/fluid/platform/mkldnn_reuse.h          |   4 +-
 python/paddle/fluid/framework.py              |  15 +-
 python/paddle/fluid/io.py                     | 454 +++++++++++++-----
 .../fluid/tests/unittests/dist_save_load.py   |  57 ++-
 .../fluid/tests/unittests/dist_simnet_bow.py  |  13 +-
 .../fluid/tests/unittests/test_dist_base.py   |   6 +-
 .../tests/unittests/test_dist_save_load.py    |  73 ++-
 .../tests/unittests/test_dist_transpiler.py   |  49 +-
 .../fluid/transpiler/distribute_transpiler.py | 414 ++++++++++++++--
 21 files changed, 1122 insertions(+), 280 deletions(-)

diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 7875c16c3cf..52310f8d04d 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -74,7 +74,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   SendProcessor* s = new SendProcessor(ch);
-  const std::string method = "SendRPC";
+  const std::string method = kSendRPC;
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
@@ -107,7 +107,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
 
 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
-  VLOG(100) << "ProcGetResponse";
+  VLOG(4) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -127,59 +127,74 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const platform::DeviceContext& ctx,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
+                                     const std::string& out_varname,
                                      int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name,
+  return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname,
                       "/sendrecv.SendRecvService/GetVariable", time_out);
 }
 
+VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    const std::string& out_varname, int64_t time_out) {
+  std::string var_name_no_barrier =
+      string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE);
+  // Request the suffixed name over the GetVariableNoBarrier RPC path.
+  return _AsyncGetVar(
+      ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname,
+      "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out);
+}
+
 VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
     const std::string& ep, const platform::DeviceContext& ctx,
     const framework::Scope& scope, const std::string& var_name,
     int64_t time_out) {
-  return _AsyncGetVar(ep, ctx, scope, var_name,
+  return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name,
                       "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
 }
 
-VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
-                                      const platform::DeviceContext& ctx,
-                                      const framework::Scope& scope,
-                                      const std::string& var_name,
-                                      const std::string& rpc_path,
-                                      int64_t time_out) {
+VarHandlePtr GRPCClient::_AsyncGetVar(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& method,
+    const std::string& var_name, const std::string& out_varname,
+    const std::string& rpc_path, int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
+  const std::string out_varname_val = out_varname;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
-  const std::string method = "GetRPC";
-  VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
+
+  VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
-    // prepare input
-    sendrecv::VariableMessage req;
-    req.set_varname(var_name_val);
-    req.set_trainer_id(trainer_id_);
-    ::grpc::ByteBuffer buf;
-    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
+  framework::AsyncIO(
+      [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] {
+        // prepare input
+        sendrecv::VariableMessage req;
+        req.set_varname(var_name_val);
+        req.set_out_varname(out_varname_val);
+        req.set_trainer_id(trainer_id_);
+        ::grpc::ByteBuffer buf;
+        RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
 
-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+        VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
-    // stub context
-    s->response_call_back_ = ProcGetResponse;
+        // stub context
+        s->response_call_back_ = ProcGetResponse;
 
-    platform::RecordRPCEvent record_event(method, p_ctx);
+        platform::RecordRPCEvent record_event(method, p_ctx);
 
-    auto call =
-        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
-    call->StartCall();
-    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+        auto call =
+            s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
+        call->StartCall();
+        call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 
-    if (UNLIKELY(platform::IsProfileEnabled())) {
-      h->Wait();
-    }
-  });
+        if (UNLIKELY(platform::IsProfileEnabled())) {
+          h->Wait();
+        }
+      });
 
   req_count_++;
 
@@ -202,7 +217,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
 
-  const std::string method = "PrefetchRPC";
+  const std::string method = kPrefetchRPC;
 
   VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
@@ -242,7 +257,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = "BatchBarrierRPC";
+  const std::string method = kBatchBarrierRPC;
   VarHandlePtr h(
       new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
@@ -267,7 +282,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
                                                int64_t time_out) {
   const auto ch = GetChannel(ep);
   FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  const std::string method = "FetchBarrierRPC";
+  const std::string method = kFetchBarrierRPC;
   VarHandlePtr h(
       new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
@@ -293,7 +308,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
                                                 int64_t time_out) {
   const auto ch = GetChannel(ep);
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = "SendMonomerFetchBarrierRPC";
+  const std::string method = kSendMonomerFetchBarrierRPC;
   VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
   s->Prepare(h, time_out);
 
@@ -320,7 +335,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
   const auto ch = GetChannel(ep);
 
   BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  const std::string method = "SendCompleteRPC";
+  const std::string method = kSendCompleteRPC;
   VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
   s->Prepare(h, time_out);
 
@@ -347,7 +362,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
 
   CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
 
-  const std::string method = "CheckPointNotifyRPC";
+  const std::string method = kCheckPointNotifyRPC;
 
   VarHandlePtr h(
       new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr));
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
index fa77d212576..ce0d2152aa2 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -186,8 +186,15 @@ class GRPCClient : public RPCClient {
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
+                           const std::string& out_varname,
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
+  VarHandlePtr AsyncGetVarNoBarrier(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      const std::string& out_varname,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncGetMonomerVariable(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& var_name,
@@ -228,11 +235,11 @@ class GRPCClient : public RPCClient {
   void Proceed();
 
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
-  VarHandlePtr _AsyncGetVar(const std::string& ep,
-                            const platform::DeviceContext& ctx,
-                            const framework::Scope& scope,
-                            const std::string& var_name, const std::string& rpc,
-                            int64_t time_out);
+  VarHandlePtr _AsyncGetVar(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& method,
+      const std::string& var_name, const std::string& out_varname,
+      const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline);
 
  private:
   grpc::CompletionQueue cq_;
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index 08f777e279e..4a9c158cb0a 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -136,17 +136,65 @@ class RequestGet final : public RequestBase {
   void Process() override {
     // proc request.
     std::string varname = request_.varname();
+    std::string out_varname = request_.out_varname();
     int trainer_id = request_.trainer_id();
-    VLOG(4) << "RequestGet " << varname;
+
+    VLOG(4) << "RequestGet " << out_varname << " from " << varname;
 
     auto scope = request_handler_->scope();
-    auto invar = scope->FindVar(varname);
+    framework::Variable* invar = nullptr;
     framework::Variable* outvar = nullptr;
 
-    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id);
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id,
+                             out_varname);
 
     if (outvar) {
-      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
+                            &reply_);
+    }
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+};
+
+class RequestGetNoBarrier final : public RequestBase {
+ public:
+  explicit RequestGetNoBarrier(GrpcService::AsyncService* service,
+                               ::grpc::ServerCompletionQueue* cq,
+                               RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetVariableNoBarrier);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetNoBarrier() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+    std::string out_varname = request_.out_varname();
+    int trainer_id = request_.trainer_id();
+
+    VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname;
+
+    auto scope = request_handler_->scope();
+    framework::Variable* invar = nullptr;
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id,
+                             out_varname);
+
+    if (outvar) {
+      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
                             &reply_);
     }
     Finish(reply_, &responder_);
@@ -460,6 +508,9 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
     b = new RequestSend(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestGet) {
     b = new RequestGet(&service_, cq.get(), handler, req_id);
+
+  } else if (rpc_name == kRequestGetNoBarrier) {
+    b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestGetMonomerVariable) {
     b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
                                       this);
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h
index 0b5c5151e63..2965fe4490b 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h
@@ -81,6 +81,7 @@ enum class GrpcMethod {
   kGetVariable,
   kPrefetchVariable,
   kCheckpointNotify,
+  kGetVariableNoBarrier,
   kGetMonomerVariable,
   kGetMonomerBarrier,
 };
@@ -94,6 +95,8 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kGetVariableNoBarrier:
+      return "/sendrecv.SendRecvService/GetVariableNoBarrier";
     case GrpcMethod::kGetMonomerVariable:
       return "/sendrecv.SendRecvService/GetMonomerVariable";
     case GrpcMethod::kGetMonomerBarrier:
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 62b24f150b4..991158ac720 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -42,11 +42,24 @@ constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
 constexpr char kRequestPrefetch[] = "RequestPrefetch";
 constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
+constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier";
+
+constexpr char kSendRPC[] = "SendRPC";
+constexpr char kGetRPC[] = "GetRPC";
+constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC";
+constexpr char kGetMonomerRPC[] = "GetMonomerRPC";
+constexpr char kPrefetchRPC[] = "PrefetchRPC";
+constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC";
+constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
+constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
+constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
+constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
 #define COMPLETE_MESSAGE "COMPLETE@RECV"
+#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV"
 
 #define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
 #define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 9722f8c96e9..913ae76b38d 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/string/piece.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
@@ -81,7 +82,8 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                const int trainer_id,
                                const std::string& out_var_name,
                                const std::string& table_name) {
-  VLOG(4) << "RequestGetHandler:" << varname;
+  VLOG(4) << "RequestGetHandler:" << varname
+          << " out_var_name: " << out_var_name;
 
   if (sync_mode_) {
     if (varname == FETCH_BARRIER_MESSAGE) {
@@ -112,6 +114,32 @@ bool RequestGetHandler::Handle(const std::string& varname,
   return true;
 }
 
+bool RequestGetNoBarrierHandler::Handle(const std::string& varname,
+                                        framework::Scope* scope,
+                                        framework::Variable* invar,
+                                        framework::Variable** outvar,
+                                        const int trainer_id,
+                                        const std::string& out_var_name,
+                                        const std::string& table_name) {
+  VLOG(4) << "RequestGetNoBarrierHandler:" << varname
+          << " out_var_name: " << out_var_name;
+
+  // get var from pserver immediately without barriers
+  string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE);
+  string::Piece var_name_piece = string::Piece(varname);
+
+  if (string::Contains(var_name_piece, without_barrier_piece)) {
+    var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece);
+    VLOG(4) << "Get var " << var_name_piece << " with "
+            << WITHOUT_BARRIER_MESSAGE;
+    *outvar = scope_->FindVar(var_name_piece.ToString());
+    return true;
+  } else {
+    PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE);
+  }
+  return true;
+}
+
 bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Scope* scope,
                                     framework::Variable* invar,
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index 5e0b25c5c2c..f3c1b24526b 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -67,6 +67,16 @@ class RequestGetHandler final : public RequestHandler {
   bool enable_dc_asgd_;
 };
 
+class RequestGetNoBarrierHandler final : public RequestHandler {
+ public:
+  RequestGetNoBarrierHandler() : RequestHandler(false) {}
+  virtual ~RequestGetNoBarrierHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
+};
+
 static inline void BuildVar(const std::string& param_name,
                             std::initializer_list<const char*> arguments,
                             paddle::framework::proto::OpDesc::Var* var) {
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index b668d869787..ea54e0c2951 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -43,8 +43,15 @@ class RPCClient {
                                    const platform::DeviceContext& ctx,
                                    const framework::Scope& scope,
                                    const std::string& var_name,
+                                   const std::string& out_varname,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetVarNoBarrier(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      const std::string& out_varname,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncGetMonomerVariable(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& var_name,
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index b39eef04d8d..63036678843 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -17,8 +17,14 @@ package sendrecv;
 option cc_generic_services = @cc_generic_services@;
 
 service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
@@ -27,12 +33,17 @@ service SendRecvService {
   rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
 
+// It can be: LoDTensor, SelectedRows or NCCL_ID
 enum VarType {
   LOD_TENSOR = 0;
   SELECTED_ROWS = 1;
   NCCL_ID = 2;
 }
 
+// VariableMessage is serialized paddle variable message.
+// NOTICE(gongwb): don't modify this proto if you are
+//   not familiar with how we serialize it in sendrecvop_utils.h
+//   and deserialize it in variable_response.h.
 message VariableMessage {
   enum Type {
     // Pod Types
@@ -49,14 +60,21 @@ message VariableMessage {
   string varname = 1;
   // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
   VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
   Type data_type = 3;
   repeated int64 dims = 4;
 
+  // lod details:
   int64 lod_level = 5;
   repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
   int64 slr_height = 7;
+  // tensor data
   bytes serialized = 8;
+  // selected_rows data
   bytes rows = 9;
+  // Look up table block execution output variable name.
   string out_varname = 10;
   // If 1, the ps server will start profiling, the ps
   // server stops profiling and generates a profile to /tmp/profile_ps_*
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 629f364d712..53968831ea0 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -347,6 +347,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
       new distributed::RequestPrefetchHandler(sync_mode));
   request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
       sync_mode, checkpoint_block_id));
+  request_get_no_barrier_handler_.reset(
+      new distributed::RequestGetNoBarrierHandler());
 
   rpc_service_->RegisterRPC(distributed::kRequestSend,
                             request_send_handler_.get(),
@@ -359,6 +361,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
                             FLAGS_rpc_prefetch_thread_num);
   rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
                             request_checkpoint_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier,
+                            request_get_no_barrier_handler_.get());
 
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -413,6 +417,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   f(request_get_handler_.get());
   f(request_prefetch_handler_.get());
   f(request_checkpoint_handler_.get());
+  f(request_get_no_barrier_handler_.get());
 
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index 9431978df83..f20442bad7c 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -55,7 +55,6 @@ class ListenAndServOp : public framework::OperatorBase {
                   const framework::VariableNameMap& inputs,
                   const framework::VariableNameMap& outputs,
                   const framework::AttributeMap& attrs);
-
   virtual ~ListenAndServOp();
 
   void RunSyncLoop(framework::Executor* executor,
@@ -89,6 +88,8 @@ class ListenAndServOp : public framework::OperatorBase {
   mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
   mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
   mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_get_no_barrier_handler_;
   mutable std::shared_ptr<distributed::RequestHandler>
       request_prefetch_handler_;
   mutable std::shared_ptr<distributed::RequestHandler>
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 48065437e38..120c65f2969 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -27,30 +27,50 @@ namespace operators {
 
 class RecvOp : public framework::OperatorBase {
  public:
-  RecvOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    auto outs = Outputs("Out");
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    std::vector<std::string> varnames =
+        Attr<std::vector<std::string>>("varnames");
     int sync_mode = Attr<int>("sync_mode");
+    auto outs = Outputs("Out");
+    bool with_barrier = Attr<bool>("with_barrier");
 
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    auto& ctx = *pool.Get(place);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(place);
 
-    distributed::RPCClient* rpc_client =
+    distributed::RPCClient *rpc_client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(
             Attr<int>("trainer_id"));
 
-    std::vector<distributed::VarHandlePtr> rets;
-    for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-      rets.push_back(rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]));
-    }
-    if (sync_mode) {
+    if (with_barrier) {
+      std::vector<distributed::VarHandlePtr> rets;
+      for (size_t i = 0; i < outs.size(); i++) {
+        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                << varname << " and with AsyncGetVar";
+        rets.push_back(
+            rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i]));
+      }
+      if (sync_mode) {
+        for (size_t i = 0; i < rets.size(); i++) {
+          PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+        }
+      }
+    } else {
+      std::vector<distributed::VarHandlePtr> rets;
+      for (size_t i = 0; i < outs.size(); i++) {
+        std::string varname = varnames.size() == 0 ? outs[i] : varnames[i];
+        VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with "
+                << varname << " and with AsyncGetVarNoBarrier";
+        rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope,
+                                                        varname, outs[i]));
+      }
       for (size_t i = 0; i < rets.size(); i++) {
         PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
       }
@@ -79,12 +99,23 @@ This operator can get variables from server side.
                  "(int, default 0)"
                  "sync recv or async recv.")
         .SetDefault(0);
+    AddAttr<bool>("with_barrier",
+                  "(bool, default True) if with_barrier=False, will use "
+                  "AsyncGetVarNoBarrier get variable from pserver immediately")
+        .SetDefault(true);
+    AddAttr<std::vector<std::string>>(
+        "varnames",
+        "(string vector, default {}) "
+        "sometimes we need to put received var in another name "
+        "for example: we need var named 'moment_1@127.0.0.1:1001', "
+        "and it real name on parameter server is 'moment_1'. ")
+        .SetDefault({});
   }
 };
 
 class RecvOpShapeInference : public framework::InferShapeBase {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {}
+  void operator()(framework::InferShapeContext *ctx) const override {}
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index faac6a12c66..269280d604a 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -365,7 +365,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
     mem_fmt.ndims = axis.size();
     for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
       mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
-                                     // regardless physical layout)
+      // regardless physical layout)
     }
     mem_fmt.data_type = mkldnn_f32;
     mem_fmt.format = mkldnn_blocked;
@@ -374,7 +374,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
     for (int i = nchw_tz.size() - 1; i >= 0; --i) {
       mem_fmt.layout_desc.blocking.padding_dims[i] =
           nchw_tz[i];  // logical dimensions (nchw format, regardless physical
-                       // layout)
+      // layout)
       mem_fmt.layout_desc.blocking.block_dims[i] = 1;
       mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
       mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index fc5e471ae30..22f505854e2 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1696,12 +1696,20 @@ class Program(object):
         self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
         self._op_role_var = []
 
-        # for distribute
+        # for distribute training
+        # _is_distributed = True if under distributed training
         self._is_distributed = False
+        # _is_chief = True if the trainer is the first one, usually No.0
         self._is_chief = False
-        self._slice_vars_and_attrs = []
+        # _parameters_on_pservers records all the parameters distributed on parameter servers.
+        self._parameters_on_pservers = None
+        # _endpoints is a list of parameter server endpoints, such as ["ip:port","ip:port"]
         self._endpoints = []
+        # if current role is parameter server, the _ps_endpoint is its "ip:port"
+        self._ps_endpoint = None
+        # trainers_endpoints, it is used for distribution.
         self._trainers_endpoints = []
+        # the distributed lookup table names
         self._distributed_lookup_table = None
 
     @property
@@ -2232,8 +2240,9 @@ class Program(object):
                             "Program")
         self._is_distributed = other._is_distributed
         self._is_chief = other._is_chief
-        self._slice_vars_and_attrs = other._slice_vars_and_attrs
+        self._parameters_on_pservers = other._parameters_on_pservers
         self._endpoints = other._endpoints
+        self._ps_endpoint = other._ps_endpoint
         self._distributed_lookup_table = other._distributed_lookup_table
 
     def _copy_data_info_from(self, other):
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index e74a87fc68d..6b1d4cc34f3 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -19,6 +19,7 @@ import errno
 import time
 import shutil
 import six
+from functools import reduce
 
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
@@ -183,8 +184,6 @@ def save_vars(executor,
             # NOTE: don't save the variable which type is RAW
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name == main_program._distributed_lookup_table:
-                continue
             new_var = _clone_var_in_block_(save_block, each_var)
             if filename is None:
                 save_block.append_op(
@@ -206,16 +205,6 @@ def save_vars(executor,
                 outputs={},
                 attrs={'file_path': os.path.join(dirname, filename)})
 
-        # if there is lookup table, the trainer 0 will notify all pserver to save.
-        if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-            lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-            attrs = {}
-            attrs['epmap'] = main_program._endpoints
-            attrs['dir'] = lookup_table_filename
-            attrs['lookup_table'] = main_program._distributed_lookup_table
-            save_block.append_op(
-                type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-
         executor.run(save_program)
 
 
@@ -267,6 +256,186 @@ def save_params(executor, dirname, main_program=None, filename=None):
         filename=filename)
 
 
+def _save_distributed_persistables(executor, dirname, main_program):
+    """
+    save_persistables for distributed training.
+    the method will do things listed below:
+    1.save part of persistable variables on trainer.
+    2.receive "remote prefetch variables" from parameter servers and merge them.
+    3.save "distributed lookup table" on parameter servers.
+    4.receive "optimizer variables" from parameter servers and merge them.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program): The program whose parameters will be
+                            saved. The main_program must be the trainer program
+                            obtained from the transpiler.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            train_program = t.get_trainer_program()
+            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
+    """
+
+    def __save_remote_params(executor, dirname, remote_params_map):
+        """
+        receive the params stored on pservers through rpc.
+        if the params are sliced, concat them into one, then save it.
+        """
+        if not remote_params_map:
+            return
+
+        prog = Program()
+        block = prog.global_block()
+
+        # recv optimize vars from pserver
+        for name, remote_params in remote_params_map.items():
+            origin_var = None
+            is_slice = False
+            slice_vars = [0] * len(remote_params)
+            slice_var_names = [""] * len(remote_params)
+            endpoints = [""] * len(remote_params)
+
+            for idx, optimizer in enumerate(remote_params):
+                origin = optimizer.origin
+                slice = optimizer.slice
+                is_slice = optimizer.is_slice
+                block_id = optimizer.block_id
+                endpoint = optimizer.endpoint
+
+                if idx == 0:
+                    origin_var = block.create_var(
+                        name=origin.name,
+                        type=origin.type,
+                        shape=origin.shape,
+                        dtype=origin.dtype,
+                        persistable=True)
+
+                slice_var = block.create_var(
+                    name="{}.slice.{}".format(slice.name, idx),
+                    type=slice.type,
+                    shape=slice.shape,
+                    dtype=slice.dtype,
+                    persistable=True)
+
+                index = block_id if is_slice else idx
+                slice_vars[index] = slice_var
+                slice_var_names[index] = slice.name
+                endpoints[index] = endpoint
+
+            if is_slice:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": slice_vars},
+                    attrs={
+                        "epmap": endpoints,
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+                block.append_op(
+                    type='concat',
+                    inputs={'X': slice_vars},
+                    outputs={'Out': origin_var},
+                    attrs={})
+            else:
+                block.append_op(
+                    type='recv',
+                    inputs={"X": []},
+                    outputs={"Out": [origin_var]},
+                    attrs={
+                        "epmap": endpoints[:1],
+                        "with_barrier": False,
+                        "varnames": slice_var_names,
+                        "sync_mode": True
+                    })
+            block.append_op(
+                type='save',
+                inputs={'X': [origin_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(dirname, origin_var.name)})
+            block.append_op(type='delete_var', inputs={'X': slice_vars})
+        executor.run(prog)
+
+    def __save_distributed_lookup_tables(executor, dirname,
+                                         distributed_lookup_table, endpoints):
+        """
+        because the distributed lookup table may be too huge to merge and save in one place,
+        it will be saved on each parameter server independently.
+
+        the save directory is dirname/"__lookup_table__".
+
+        """
+        prog = Program()
+        block = prog.global_block()
+
+        # if there is lookup table, the trainer 0 will notify all pserver to save.
+        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
+        attrs = {}
+        attrs['epmap'] = endpoints
+        attrs['dir'] = lookup_table_filename
+        attrs['lookup_table'] = distributed_lookup_table
+        block.append_op(
+            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+        executor.run(prog)
+
+    def __exclude_vars(exclude_var_names=[]):
+        def is_valid(var):
+            if var.name in exclude_var_names:
+                return False
+            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+                        var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+                        var.desc.type() == core.VarDesc.VarType.READER:
+                return False
+            return var.persistable
+
+        return is_valid
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_save_distributed_persistables' just be designed for distributed training."
+        )
+
+    remote_params_map = main_program._parameters_on_pservers.get_distributed_vars_by_vtypes(
+        ["Optimizer", "RemotePrefetch"], groupby=True)
+
+    exclude_var_names = []
+    if remote_params_map:
+        exclude_var_names.extend(remote_params_map.keys())
+
+    if main_program._distributed_lookup_table:
+        if isinstance(main_program._distributed_lookup_table, list):
+            exclude_var_names.extend(main_program._distributed_lookup_table)
+        else:
+            exclude_var_names.append(main_program._distributed_lookup_table)
+
+    local_vars = list(
+        filter(__exclude_vars(exclude_var_names), main_program.list_vars()))
+    save_vars(
+        executor, main_program=main_program, dirname=dirname, vars=local_vars)
+
+    if main_program._is_chief:
+        if remote_params_map:
+            __save_remote_params(executor, dirname, remote_params_map)
+        if main_program._distributed_lookup_table:
+            __save_distributed_lookup_tables(
+                executor, dirname, main_program._distributed_lookup_table,
+                main_program._endpoints)
+
+
 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
     This function filters out all variables with `persistable==True` from the
@@ -301,13 +470,19 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.save_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
-    save_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=is_persistable,
-        filename=filename)
+
+    if main_program and main_program._is_distributed:
+        _save_distributed_persistables(
+            executor, dirname=dirname, main_program=main_program)
+
+    else:
+        save_vars(
+            executor,
+            dirname=dirname,
+            main_program=main_program,
+            vars=None,
+            predicate=is_persistable,
+            filename=filename)
 
 
 def load_vars(executor,
@@ -402,17 +577,11 @@ def load_vars(executor,
         if not isinstance(main_program, Program):
             raise TypeError("program should be as Program type or None")
 
-        load_slice_vars = []
-        for each_var in main_program._slice_vars_and_attrs:
-            load_slice_vars.append(each_var[2].name)
-
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             if each_var.type == core.VarDesc.VarType.RAW:
                 continue
-            if each_var.name in load_slice_vars:
-                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -435,10 +604,6 @@ def load_vars(executor,
                 attrs={'file_path': os.path.join(dirname, filename)})
         executor.run(load_prog)
 
-        # load slice vars on pserver, if have it.
-        _load_slice_up_vars(executor, dirname,
-                            main_program._slice_vars_and_attrs)
-
 
 def load_params(executor, dirname, main_program=None, filename=None):
     """
@@ -521,12 +686,134 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
             fluid.io.load_persistables(executor=exe, dirname=param_path,
                                        main_program=None)
     """
-    load_vars(
-        executor,
-        dirname=dirname,
-        main_program=main_program,
-        predicate=is_persistable,
-        filename=filename)
+
+    if main_program and main_program._is_distributed:
+        _load_distributed_persistables(
+            executor, dirname=dirname, main_program=main_program)
+    else:
+        load_vars(
+            executor,
+            dirname=dirname,
+            main_program=main_program,
+            predicate=is_persistable,
+            filename=filename)
+
+
+def _load_distributed_persistables(executor, dirname, main_program=None):
+    """
+    customized load_persistables for distributed training.
+    it should be used on parameter server,
+
+    Args:
+        executor(Executor): The executor to run for loading parameters.
+        dirname(str): The load directory path.
+        main_program(Program): The program whose parameters will be
+                            loaded. the main_program must be the pserver_program
+                            get after transpiler.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: main_program is not a distributed Program, or its
+            _ps_endpoint is not set.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(...)
+            pserver_prog = t.get_pserver_program(...)
+            _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
+    """
+
+    def __is_distributed_part_var(varname):
+        # str.find() returns -1 (truthy) on a miss, so test membership instead.
+        return ".trainer_" in varname or ".block" in varname
+
+    def __load_persistable_vars(executor, dirname, need_load_vars):
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        need_delete_vars = []
+
+        for param in need_load_vars:
+            origin_var = param.origin
+            slice_var = param.slice
+            is_slice = param.is_slice
+            offset = param.offset
+
+            # A sliced param is first loaded whole into a temporary
+            # "<name>.load" var; an unsliced one is loaded under its own name.
+            load_name = origin_var.name
+            if is_slice:
+                load_name = "{}.load".format(origin_var.name)
+
+            origin = load_block.create_var(
+                name=load_name,
+                type=origin_var.type,
+                shape=origin_var.shape,
+                dtype=origin_var.dtype,
+                persistable=True)
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [origin]},
+                attrs={
+                    'file_path': os.path.join(dirname, origin_var.name)
+                })
+
+            if not is_slice:
+                continue
+
+            slice_out = load_block.create_var(
+                name=slice_var.name,
+                type=slice_var.type,
+                shape=slice_var.shape,
+                dtype=slice_var.dtype,
+                persistable=True)
+
+            # offset is documented as a numel, so dividing by the size of one
+            # row gives the first row. The initial value 1 keeps reduce() from
+            # raising on 1-D vars, whose shape[1:] is empty.
+            dim1_flatten = reduce(lambda x, y: x * y, slice_out.shape[1:], 1)
+            start = int(offset / dim1_flatten)
+            end = int(offset / dim1_flatten + slice_out.shape[0])
+
+            load_block.append_op(
+                type="slice",
+                inputs={'Input': origin},
+                outputs={'Out': slice_out},
+                attrs={'axes': [0],
+                       'starts': [start],
+                       'ends': [end]})
+
+            # The whole-var copy only exists to produce the slice.
+            need_delete_vars.append(origin)
+
+        load_block.append_op(
+            type='delete_var',
+            inputs={'X': need_delete_vars}, )
+
+        executor.run(load_prog)
+
+    if not isinstance(main_program, Program):
+        raise ValueError("'main_program' should be an instance of Program.")
+
+    if not main_program._is_distributed:
+        raise ValueError(
+            "'_load_distributed_persistables' just be designed for distributed training."
+        )
+
+    if not main_program._ps_endpoint:
+        raise ValueError(
+            "'_load_distributed_persistables' need current_endpoint set in DistributeTranspiler.transpile"
+        )
+
+    need_load_vars = main_program._parameters_on_pservers.get_distributed_vars_by_ep(
+        main_program._ps_endpoint)
+    __load_persistable_vars(executor, dirname, need_load_vars)
 
 
 def prepend_feed_ops(inference_program,
@@ -795,52 +1082,6 @@ def load_inference_model(dirname,
     return [program, feed_target_names, fetch_targets]
 
 
-def _save_lookup_tables_by_notify(executor, dirname, lookup_table,
-                                  pserver_endpoints):
-    """
-    This function will send checkpoint notify message from Trainer 0
-    to all the pservers.
-    The checkpoint notify message contains lookup table name,
-    the absolute path on pserver to save lookup_table.
-
-    Args:
-        executor(Executor): The executor to run for send checkpoint notify.
-        dirname(str): The folder where to save.
-        lookup_table(string): the lookup table name, when use distribute
-            lookup table, we can get lookup table name by DistributeTranspiler.
-            table_name
-        ps_endpoint_list(list): the parameter server ip:port list.
-            when use distribute lookup table, we can get ps_endpoint_list by
-            distribute arguments.
-    Return:
-        None
-
-    Examples:
-        .. code-block:: python
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            param_path = "./my_paddle_model"
-            table_name = "share_w"
-            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
-
-            _save_pserver_vars_by_notify(executor=exe,
-                    dirname=param_path, lookup_table=table_name,
-                    pserver_endpoints=ps_endpoints)
-    """
-
-    pserver_notify_program = Program()
-    pserver_notify_block = pserver_notify_program.global_block()
-
-    attrs = {}
-    attrs['epmap'] = pserver_endpoints
-    attrs['dir'] = dirname
-    attrs['lookup_table'] = lookup_table
-
-    pserver_notify_block.append_op(
-        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
-    executor.run(pserver_notify_program)
-
-
 def _endpoints_replacement(program, endpoints):
     ENDPOINT_MAP = "epmap"
     for op in program.global_block().ops:
@@ -911,54 +1152,3 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
-
-
-def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
-    if not slice_vars_and_attrs:
-        return
-
-    load_prog = Program()
-    load_block = load_prog.global_block()
-    need_delete_vars = []
-
-    for var_tuple in slice_vars_and_attrs:
-        orig_var = var_tuple[0]
-        start = var_tuple[1]
-        slice_var = var_tuple[2]
-        end = start + slice_var.shape[0]
-
-        orig_var_name = orig_var.name
-        orig_var.name = "{}.origin".format(orig_var_name)
-
-        clone_orig_var = load_block.create_var(
-            name=orig_var.name,
-            type=orig_var.type,
-            shape=orig_var.shape,
-            dtype=orig_var.dtype,
-            persistable=True)
-
-        clone_slice_var = load_block.create_var(
-            name=slice_var.name,
-            type=slice_var.type,
-            shape=slice_var.shape,
-            dtype=slice_var.dtype,
-            persistable=True)
-
-        load_block.append_op(
-            type='load',
-            inputs={},
-            outputs={'Out': [clone_orig_var]},
-            attrs={'file_path': os.path.join(dirname, orig_var_name)})
-        load_block.append_op(
-            type="slice",
-            inputs={'Input': clone_orig_var},
-            outputs={'Out': clone_slice_var},
-            attrs={'axes': [0],
-                   'starts': [start],
-                   'ends': [end]})
-        need_delete_vars.append(clone_orig_var)
-
-    load_block.append_op(
-        type='delete_var',
-        inputs={'X': need_delete_vars}, )
-    executor.run(load_prog)
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
index faec5350424..f0f13a9d49c 100644
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -80,7 +80,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
         # NOTE: pserver should not call memory optimize
         t = self.get_transpiler(args.trainer_id,
                                 fluid.default_main_program(), args.endpoints,
-                                args.trainers, args.sync_mode)
+                                args.trainers, args.sync_mode, False,
+                                args.current_endpoint)
         pserver_prog = t.get_pserver_program(args.current_endpoint)
         startup_prog = t.get_startup_program(args.current_endpoint,
                                              pserver_prog)
@@ -93,7 +94,8 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
         exe.run(startup_prog)
 
         if need_load and model_dir:
-            self._load_persistable_vars(exe, model_dir, startup_prog)
+            fluid.io.load_persistables(exe, model_dir, pserver_prog)
+
         exe.run(pserver_prog)
 
     def run_trainer(self, args):
@@ -158,19 +160,46 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         need_save = bool(int(os.getenv("SAVE", "0")))
         model_dir = os.getenv("MODEL_DIR", "")
-
-        if need_save:
-            for _ in six.moves.xrange(RUN_STEP):
-                loss, = exe.run(fetch_list=[avg_cost.name],
-                                feed=feeder.feed(get_data()))
-            if need_save and model_dir:
-                io.save_persistables(startup_exe, model_dir, trainer_prog)
-
-        var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor())
-        if six.PY2:
-            print(pickle.dumps(np.ravel(var).tolist()))
+        save_mode = os.getenv("SAVE_MODE", "")
+
+        if save_mode == "LOCAL":
+            if need_save:
+                for _ in six.moves.xrange(RUN_STEP):
+                    loss, = exe.run(fetch_list=[avg_cost.name],
+                                    feed=feeder.feed(get_data()))
+                if need_save and model_dir:
+                    io.save_persistables(startup_exe, model_dir, trainer_prog)
+
+            var = np.array(fluid.global_scope().find_var('__fc_b__').get_tensor(
+            ))
+            if six.PY2:
+                print(pickle.dumps(np.ravel(var).tolist()))
+            else:
+                sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
+
+        elif save_mode == "DIST":
+            skip_steps = int(os.getenv("SKIP_STEPS"))
+            loss = None
+            if need_save:
+                for idx in six.moves.xrange(8):
+                    loss, = exe.run(fetch_list=[avg_cost.name],
+                                    feed=feeder.feed(get_data()))
+                    if need_save and model_dir and idx == skip_steps and args.trainer_id == 0:
+                        io.save_persistables(startup_exe, model_dir,
+                                             trainer_prog)
+            else:
+                for idx in six.moves.xrange(8):
+                    data = get_data()
+                    if idx <= skip_steps:
+                        continue
+                    loss, = exe.run(fetch_list=[avg_cost.name],
+                                    feed=feeder.feed(data))
+            if six.PY2:
+                print(pickle.dumps(loss.tolist()))
+            else:
+                sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
         else:
-            sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))
+            raise Exception("save_mode must be LOCAL or DIST")
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
index fac5e037a46..09afae6114e 100644
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
@@ -75,9 +75,13 @@ def get_loss(cos_q_pt, cos_q_nt):
     return avg_cost
 
 
-def get_optimizer():
-    # SGD optimizer
-    optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
+def get_optimizer(op="sgd"):
+    # The name is matched case-insensitively; unknown names fall back to SGD.
+    use_adam = op.upper() == "ADAM"
+    if use_adam:
+        optimizer = fluid.optimizer.Adam(learning_rate=base_lr)
+    else:
+        optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
     return optimizer
 
 
@@ -237,7 +241,8 @@ class TestDistSimnetBow2x2(TestDistRunnerBase):
         inference_program = fluid.default_main_program().clone()
 
         # Optimization
-        opt = get_optimizer()
+        opt = os.getenv('OPTIMIZER', 'sgd')
+        opt = get_optimizer(opt)
         opt.minimize(avg_cost)
 
         # Reader
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 69a38618cde..e51ae1a944e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -43,7 +43,8 @@ class TestDistRunnerBase(object):
                        pserver_endpoints,
                        trainers,
                        sync_mode,
-                       dc_asgd=False):
+                       dc_asgd=False,
+                       current_endpoint=None):
         # NOTE: import fluid until runtime, or else forking processes will cause error.
         config = fluid.DistributeTranspilerConfig()
         config.enable_dc_asgd = dc_asgd
@@ -53,7 +54,8 @@ class TestDistRunnerBase(object):
             program=main_program,
             pservers=pserver_endpoints,
             trainers=trainers,
-            sync_mode=sync_mode)
+            sync_mode=sync_mode,
+            current_endpoint=current_endpoint)
         return t
 
     def run_pserver(self, args):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
index 4588ca7c17b..e795bc410ee 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -33,7 +33,6 @@ class TestDistSaveLoadDense2x2(TestDistBase):
                          delta=1e-3,
                          check_error_log=False,
                          need_envs={}):
-
         required_envs = {
             "PATH": os.getenv("PATH", ""),
             "PYTHONPATH": os.getenv("PYTHONPATH", ""),
@@ -77,7 +76,77 @@ class TestDistSaveLoadDense2x2(TestDistBase):
         need_envs = {
             "IS_DISTRIBUTED": '0',
             "IS_SPARSE": '0',
-            'IS_SELF_CONTAINED_LR': '1'
+            'IS_SELF_CONTAINED_LR': '1',
+            'SAVE_MODE': 'LOCAL',
+        }
+        self.check_with_place(
+            "dist_save_load.py",
+            delta=0,
+            check_error_log=False,
+            need_envs=need_envs)
+
+
+class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
+    def _setup_config(self):
+        # Turn on _sync_mode and pin _enforce_place to "CPU".
+        self._sync_mode, self._enforce_place = True, "CPU"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        """
+        Run the cluster with SAVE=1, then again with LOAD=1 on the same
+        temporary model dir, and assert on what self._run_cluster returned.
+
+        Args:
+            model_file(str): passed to self._run_cluster.
+            delta(float): tolerance passed to assertAlmostEqual.
+            check_error_log(bool): when True also set GLOG_v/GLOG_logtostderr.
+            need_envs(dict): extra entries merged into the envs of both runs.
+        """
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "http_proxy": ""
+        }
+        required_envs.update(need_envs)
+        if check_error_log:
+            required_envs.update(GLOG_v="3", GLOG_logtostderr="1")
+
+        # required_envs is applied last, so it overrides the keys set here.
+        model_dir = tempfile.mkdtemp()
+        save_env = {"SAVE_MODE": "DIST", "SAVE": "1", "MODEL_DIR": model_dir}
+        save_env.update(required_envs)
+        load_env = {"LOAD": "1", "MODEL_DIR": model_dir}
+        load_env.update(required_envs)
+
+        try:
+            tr0_var_1, tr1_var_1 = self._run_cluster(model_file, save_env,
+                                                     check_error_log)
+            tr0_var_2, tr1_var_2 = self._run_cluster(model_file, load_env,
+                                                     check_error_log)
+        finally:
+            # Remove the temp dir even when a cluster run raises.
+            shutil.rmtree(model_dir)
+
+        # NOTE(review): ndarray.all() reduces each result to a single bool,
+        # so these asserts do not compare the values element-wise - TODO.
+        self.assertAlmostEqual(
+            np.array(tr0_var_1).all(), np.array(tr0_var_2).all(), delta=delta)
+        self.assertAlmostEqual(
+            np.array(tr1_var_1).all(), np.array(tr1_var_2).all(), delta=delta)
+
+    def test_dist(self):
+        need_envs = {
+            "IS_DISTRIBUTED": '0',
+            "IS_SPARSE": '0',
+            'IS_SELF_CONTAINED_LR': '1',
+            'SAVE_MODE': 'DIST',
+            'OPTIMIZER': 'ADAM',
+            'SKIP_STEPS': str(np.random.randint(2, 6))
         }
         self.check_with_place(
             "dist_save_load.py",
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 3d1ce6b27c9..3566fed2152 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -741,21 +741,40 @@ class TestLoadSliceVar(TranspilerTest):
         pserver, _ = self.get_pserver(self.pserver1_ep)
         pserver2, _ = self.get_pserver(self.pserver2_ep)
 
-        self.assertTrue(pserver._slice_vars_and_attrs)
-        self.assertTrue(pserver2._slice_vars_and_attrs)
-
-        for idx in six.moves.xrange(len(pserver._slice_vars_and_attrs)):
-            self.assertEqual(pserver._slice_vars_and_attrs[idx][0],
-                             pserver2._slice_vars_and_attrs[idx][0])
-
-            total_numel = six.moves.reduce(
-                lambda x, y: x * y, pserver._slice_vars_and_attrs[idx][0].shape)
-            self.assertEqual(
-                total_numel,
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver._slice_vars_and_attrs[idx][2].shape) +
-                six.moves.reduce(lambda x, y: x * y,
-                                 pserver2._slice_vars_and_attrs[idx][2].shape))
+        vars_ps1 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver1_ep)
+        vars_ps2 = pserver._parameters_on_pservers.get_distributed_vars_by_ep(
+            self.pserver2_ep)
+
+        self.assertTrue(vars_ps1)
+        self.assertTrue(vars_ps2)
+
+        for idx in six.moves.xrange(len(vars_ps1)):
+            total_numel = 0
+            ps1_numel, ps2_numel = 0, 0
+
+            ps1_var = vars_ps1[idx]
+
+            if not ps1_var.is_slice:
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               vars_ps1[idx].origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             vars_ps1[idx].slice.shape)
+            else:
+                ps2_var = None
+                for var in vars_ps2:
+                    if var.origin.name == ps1_var.origin.name:
+                        ps2_var = var
+                        break
+
+                total_numel = six.moves.reduce(lambda x, y: x * y,
+                                               ps1_var.origin.shape)
+                ps1_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps1_var.slice.shape)
+                ps2_numel = six.moves.reduce(lambda x, y: x * y,
+                                             ps2_var.slice.shape)
+
+            self.assertEqual(total_numel, ps1_numel + ps2_numel)
 
 
 class TestNCCL2Transpile(TranspilerTest):
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ea5a4cf7cdb..c61cb54e1f2 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -39,7 +39,7 @@ from .ps_dispatcher import RoundRobin, PSDispatcher
 from .. import core, framework, unique_name
 from ..framework import Program, default_main_program, \
     default_startup_program, Block, \
-    Parameter, grad_var_name
+    Parameter, Variable, grad_var_name
 from .details import *
 from ..distribute_lookup_table import find_distributed_lookup_table
 from functools import reduce
@@ -62,6 +62,260 @@ def log(*args):
         print(args)
 
 
+class VarStruct(object):
+    """
+    A plain Python snapshot of part of a Variable's properties:
+    name, shape, dtype, type, lod_level and persistable.
+    """
+
+    def __init__(self, name, shape, dtype, type, lod_level, persistable):
+        # NOTE: `type` shadows the builtin inside __init__; the name is kept
+        # so that keyword callers keep working.
+        self.name, self.shape = name, shape
+        self.dtype, self.type = dtype, type
+        self.lod_level, self.persistable = lod_level, persistable
+
+
+class VarDistributed(object):
+    """
+    Describe how one variable is distributed on the parameter servers.
+    An instance ties an origin var to one slice var and keeps the slice
+    var's placement info, such as is_slice/block_id/offset/vtype/endpoint.
+    """
+
+    def __init__(self,
+                 origin_var,
+                 slice_var,
+                 is_slice=None,
+                 block_id=None,
+                 offset=None,
+                 vtype=None,
+                 endpoint=None):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not. When None, the var counts as
+                sliced if origin_var and slice_var are not equal.
+            block_id(int|None): the number of the slice var, 0 when None.
+            offset(int|None): the numel before the slice var, 0 when None.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server that the slice var is
+                on, such as "127.0.0.1:1001"
+        """
+
+        # Variables are snapshotted into VarStruct; other inputs are kept.
+        self.origin = origin_var
+        if isinstance(origin_var, Variable):
+            self.origin = self.__create_var_struct(origin_var)
+
+        self.slice = slice_var
+        if isinstance(slice_var, Variable):
+            self.slice = self.__create_var_struct(slice_var)
+
+        # equal() asserts that both sides are VarStruct instances, so inputs
+        # that are neither Variable nor VarStruct fail here.
+        differs = not self.equal(self.origin, self.slice)
+
+        # Explicit arguments win over the derived/default values.
+        self.is_slice = differs if is_slice is None else is_slice
+        self.block_id = 0 if block_id is None else block_id
+        self.offset = 0 if offset is None else offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def __create_var_struct(var):
+        """Copy the recorded properties of `var` into a VarStruct."""
+        return VarStruct(var.name, var.shape, var.dtype, var.type,
+                         var.lod_level, var.persistable)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        Check whether two VarStruct hold the same properties.
+
+        Returns:
+            bool: True when every recorded property matches, else False
+        """
+        assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct)
+
+        # Compared in this order; all() stops at the first mismatch.
+        fields = ("name", "type", "shape", "dtype", "lod_level",
+                  "persistable")
+        return all(getattr(var1, f) == getattr(var2, f) for f in fields)
+
+    def __str__(self):
+        """Render the origin var, the slice var and the endpoint as text."""
+        # The slice description is the origin one plus the placement info.
+        var_fmt = "{name} : fluid.{type}.shape{shape}.astype({dtype})"
+        slice_fmt = var_fmt + \
+            ".slice({is_slice}).block({block_id}).offset({offset})"
+
+        origin_var_str = var_fmt.format(
+            name=self.origin.name, type=self.origin.type,
+            shape=self.origin.shape, dtype=self.origin.dtype)
+        slice_var_str = slice_fmt.format(
+            name=self.slice.name, type=self.slice.type,
+            shape=self.slice.shape, dtype=self.slice.dtype,
+            is_slice=self.is_slice,
+            block_id=self.block_id,
+            offset=self.offset)
+
+        return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format(
+            self.vtype, origin_var_str, slice_var_str, self.endpoint)
+
+
+class VarsDistributed(object):
+    """
+    A collection of VarDistributed with helpers to look distributed vars up.
+    It gives one central overview of how the parameters are distributed on
+    the parameter servers, which is convenient for developers to manage.
+    Other modules, such as io.py, can also use it to find variables.
+
+    Attributes:
+        distributed_vars(list): the recorded VarDistributed, in insert order.
+    """
+
+    def __init__(self):
+        # Filled by add_distributed_var().
+        self.distributed_vars = []
+
+    def add_distributed_var(self,
+                            origin_var,
+                            slice_var,
+                            is_slice=None,
+                            block_id=None,
+                            offset=None,
+                            vtype=None,
+                            endpoint=None):
+        """
+        Record one more distributed var.
+
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not.
+            block_id(int|None): the number of the slice var.
+            offset(int|None): if the var is sliced, the numel before the slice.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): the parameter server that the slice var is on,
+                such as "127.0.0.1:1001"
+
+        Returns:
+            None
+        """
+        # All arguments are handed to VarDistributed unchanged.
+        dist_var = VarDistributed(origin_var, slice_var, is_slice, block_id,
+                                  offset, vtype, endpoint)
+        self.distributed_vars.append(dist_var)
+
+    def get_distributed_var_by_slice(self, var_name):
+        """
+        Find a distributed var by the name of its slice var.
+
+        Args:
+            var_name(str): slice var name, such as "w.trainer0.block1"
+
+        Returns:
+            VarDistributed: the first match, or None when nothing matches.
+        """
+        # next() with a default stops at the first match.
+        matches = (dist_var for dist_var in self.distributed_vars
+                   if dist_var.slice.name == var_name)
+        return next(matches, None)
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        Check whether two vars hold the same properties.
+
+        Args:
+            var1, var2: objects that have the name, type, shape, dtype,
+                lod_level and persistable attributes.
+
+        Returns:
+            bool: True when every compared property matches, else False
+        """
+        # Compared in this order; all() stops at the first mismatch.
+        fields = ("name", "type", "shape", "dtype", "lod_level",
+                  "persistable")
+        return all(getattr(var1, f) == getattr(var2, f) for f in fields)
+
+    def get_distributed_var_by_origin_and_ep(self, origin_var_name, endpoint):
+        """
+        Find a distributed var by its origin var name and its endpoint.
+
+        Args:
+            origin_var_name(str): the name of the origin var
+            endpoint(str): the parameter endpoint, such as "127.0.0.1:1001"
+
+        Returns:
+            VarDistributed: the first match, or None when nothing matches.
+        """
+        # First-match lookup keyed by origin var name and endpoint.
+        matches = (dist_var for dist_var in self.distributed_vars
+                   if dist_var.origin.name == origin_var_name and
+                   dist_var.endpoint == endpoint)
+        return next(matches, None)
+
+    def get_distributed_vars_by_vtypes(self, vtypes, groupby=False):
+        """
+        Collect the distributed vars whose vtype is in `vtypes`.
+
+        Args:
+            vtypes(list): wanted vtypes, such as ["Optimizer", "RemotePrefetch"]
+            groupby(bool|False): group by origin var or not.
+
+        Returns:
+            list: distributed var list.
+            dict: origin var name -> distributed var list, when groupby=True
+        """
+        # `vtypes` is only used with the `in` operator.
+        vtype_vars = [var for var in self.distributed_vars
+                      if var.vtype in vtypes]
+        if not groupby:
+            return vtype_vars
+
+        # setdefault() creates the list of an origin var on first use.
+        params_map = {}
+        for var in vtype_vars:
+            origin_var_name = var.origin.name
+            params_map.setdefault(origin_var_name, []).append(var)
+        return params_map
+
+    def get_distributed_vars_by_ep(self, endpoint, vtype=None):
+        """
+        Collect the distributed vars placed on one parameter server.
+
+        Args:
+            endpoint(str): the parameter server endpoint, such as "127.0.0.1:2001"
+            vtype(str|None): distributed var's vtype, such as "Optimizer",
+                "RemotePrefetch"; a falsy value disables this filter.
+
+        Returns:
+            list: distributed var list.
+        """
+        endpoint_vars = [var for var in self.distributed_vars
+                         if var.endpoint == endpoint]
+        if not vtype:
+            return endpoint_vars
+
+        vtype_vars = [var for var in endpoint_vars if var.vtype == vtype]
+        return vtype_vars
+
+    def overview(self):
+        """
+        Get the overview string about all params on all parameter servers.
+
+        Returns:
+            Str: the str() of every distributed var, joined by newlines.
+        """
+        vars_str = [str(var) for var in self.distributed_vars]
+        return "\n".join(vars_str)
+
+
 class VarBlock:
     def __init__(self, varname, offset, size):
         self.varname = varname
@@ -223,16 +477,13 @@ class DistributeTranspiler(object):
                          trainer_id,
                          trainers,
                          current_endpoint,
-                         startup_program=None,
-                         wait_port=True):
+                         startup_program=None):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
-            if trainer_id == 0 and wait_port:
-                wait_server_ready(worker_endpoints)
 
             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -313,13 +564,11 @@ class DistributeTranspiler(object):
 
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
-            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program,
-                wait_port=self.config.wait_port)
+                startup_program=startup_program)
             return
 
         self.trainer_num = trainers
@@ -327,6 +576,7 @@ class DistributeTranspiler(object):
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
         self.pserver_endpoints = pserver_endpoints
+        self.vars_overview = VarsDistributed()
         self.optimize_ops, self.params_grads = self._get_optimize_pass()
 
         ps_dispatcher = self.config.split_method(self.pserver_endpoints)
@@ -347,6 +597,7 @@ class DistributeTranspiler(object):
         # add distributed attrs to program
         self.origin_program._is_distributed = True
         self.origin_program._endpoints = self.pserver_endpoints
+        self.origin_program._ps_endpoint = current_endpoint
         self.origin_program._is_chief = self.trainer_id == 0
         self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None
 
@@ -454,6 +705,10 @@ class DistributeTranspiler(object):
             self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
             self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
 
+            distributed_var = self.vars_overview.get_distributed_var_by_slice(
+                recv_vars[i].name)
+            distributed_var.endpoint = ep
+
         # step4: Concat the parameters splits together after recv.
         all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
@@ -480,6 +735,12 @@ class DistributeTranspiler(object):
                 recv_op_role_var_name = splited_trainer_grad[0].name
 
             if param_varname in self.sparse_param_to_height_sections:
+
+                for table_name in table_names:
+                    distributed_var = self.vars_overview.get_distributed_var_by_slice(
+                        table_name)
+                    distributed_var.vtype = "RemotePrefetch"
+
                 height_sections = self.sparse_param_to_height_sections[
                     param_varname]
                 self._update_remote_sparse_update_op(
@@ -532,6 +793,9 @@ class DistributeTranspiler(object):
                                                         pserver_endpoints)
             self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
 
+        self._get_distributed_optimizer_vars()
+        self.origin_program._parameters_on_pservers = self.vars_overview
+
     def get_trainer_program(self, wait_port=True):
         """
         Get transpiled trainer side program.
@@ -541,6 +805,7 @@ class DistributeTranspiler(object):
         """
         # remove optimize ops and add a send op to main_program
         # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
+
         lr_ops = self._get_lr_ops()
         delete_ops(self.origin_program.global_block(), self.optimize_ops)
         delete_ops(self.origin_program.global_block(), lr_ops)
@@ -665,9 +930,14 @@ class DistributeTranspiler(object):
         # NOTE: assume blocks of the same variable is not distributed
         # on the same pserver, only change param/grad varnames for
         # trainers to fetch.
+        sys.stderr.write(
+            "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
+        )
         # step1
         pserver_program = Program()
         pserver_program.random_seed = self.origin_program.random_seed
+        pserver_program._copy_dist_param_info_from(self.origin_program)
+
         # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
@@ -703,9 +973,6 @@ class DistributeTranspiler(object):
             else:
                 recv_inputs.append(single_trainer_var)
 
-        self._slice_params_and_optimizes = self._get_slice_vars_and_attrs(
-            endpoint)
-
         # step 3
         # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
@@ -882,10 +1149,6 @@ class DistributeTranspiler(object):
             outputs={},
             attrs=attrs)
 
-        # add distributed attrs
-        pserver_program._slice_vars_and_attrs = list(
-            self._slice_params_and_optimizes.values())
-
         pserver_program._sync_with_cpp()
         # save pserver program to generate pserver side startup relatively.
         self.pserver_program = pserver_program
@@ -984,30 +1247,88 @@ class DistributeTranspiler(object):
                     inputs={"X": startup_param_var},
                     outputs={"Out": startup_tmpvar})
 
-        # add slice vars
-        s_prog._slice_vars_and_attrs = pserver_program._slice_vars_and_attrs
-
         return s_prog
 
-    def _get_slice_vars_and_attrs(self, endpoint):
-        slice_vars_and_attrs = {}
+    # ====================== private transpiler functions =====================
+    def _get_slice_var_info(self, slice_var):
         block_suffix = "block"
-        for param in self.param_grad_ep_mapping[endpoint]["params"]:
-            orig_var_name, block_name, _ = self._get_varname_parts(param.name)
-            if not block_name:
-                continue
+        block_idx = 0
+        offset = 0
+        is_slice = False
 
-            block_idx = int(block_name.split(block_suffix)[1])
-            orig_var = self.origin_program.global_block().vars[orig_var_name]
+        orig_var_name, block_name, _ = self._get_varname_parts(slice_var.name)
 
-            skip_dim0 = 0
-            slice_vars = self.param_var_mapping[orig_var_name]
-            for slice_var in slice_vars[:block_idx]:
-                skip_dim0 += slice_var.shape[0]
-            slice_vars_and_attrs[param.name] = [orig_var, skip_dim0, param]
-        return slice_vars_and_attrs
+        if not block_name:
+            return is_slice, block_idx, offset
 
-    # ====================== private transpiler functions =====================
+        block_idx = int(block_name.split(block_suffix)[1])
+        skip_dim0 = 0
+        slice_vars = self.param_var_mapping[orig_var_name]
+        # The initial value 1 keeps reduce() from raising TypeError on 1-D vars (empty shape[1:]).
+        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:], 1)
+
+        for slice_var in slice_vars[:block_idx]:
+            skip_dim0 += slice_var.shape[0]
+        # offset = dim-0 rows held by the earlier slices * elements per row
+        offset = skip_dim0 * orig_dim1_flatten
+        is_slice = True
+        return is_slice, block_idx, offset
+
+    def _get_distributed_optimizer_vars(self):
+        def _get_distributed_optimizer_var(endpoint):
+            opt_op_on_pserver = []
+            for _, op in enumerate(self.optimize_ops):
+                if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
+                        endpoint, op):
+                    opt_op_on_pserver.append(op)
+            # For each such op: look up the dist var of its "Param" input, then register the other inputs.
+            for opt_op in opt_op_on_pserver:
+                dist_var = None
+                for key in opt_op.input_names:
+                    if key == "Param":
+                        param_name = opt_op.input(key)[0]
+                        dist_var = self.vars_overview.get_distributed_var_by_origin_and_ep(
+                            param_name, endpoint)
+                        break
+                for key in opt_op.input_names:
+                    if key in ["Param", "Grad", "LearningRate"]:
+                        continue
+                    origin_var = self.origin_program.global_block().vars[
+                        opt_op.input(key)[0]]
+                    # shape of this optimizer input given the param slice shape (helper defined outside this view)
+                    new_shape = self._get_optimizer_input_shape(
+                        opt_op.type, key, origin_var.shape,
+                        dist_var.slice.shape)
+                    # Same shape as the param slice: register a sliced VarStruct; otherwise register the whole var.
+                    if new_shape == dist_var.slice.shape:
+                        splited_var = VarStruct(
+                            name=origin_var.name,
+                            shape=new_shape,
+                            dtype=origin_var.dtype,
+                            type=origin_var.type,
+                            lod_level=origin_var.lod_level,
+                            persistable=origin_var.persistable)
+
+                        self.vars_overview.add_distributed_var(
+                            origin_var=origin_var,
+                            slice_var=splited_var,
+                            is_slice=dist_var.is_slice,
+                            block_id=dist_var.block_id,
+                            offset=dist_var.offset,
+                            vtype="Optimizer",
+                            endpoint=endpoint)
+                    else:
+                        self.vars_overview.add_distributed_var(
+                            origin_var=origin_var,
+                            slice_var=origin_var,
+                            is_slice=False,
+                            block_id=0,
+                            offset=0,
+                            vtype="Optimizer",
+                            endpoint=endpoint)
+
+        for ep in self.pserver_endpoints:
+            _get_distributed_optimizer_var(ep)
 
     def _update_dist_lookup_table_vars(self, param_list, grad_list,
                                        params_grads):
@@ -1093,6 +1414,22 @@ class DistributeTranspiler(object):
         # origin_param_name -> [splited_param_vars]
         self.param_var_mapping = self._create_vars_from_blocklist(
             self.origin_program, param_blocks)
+
+        for orig_name, splited_vars in self.param_var_mapping.items():
+            orig_var = self.origin_program.global_block().var(orig_name)
+
+            for splited_var in splited_vars:
+                is_slice, block_id, offset = self._get_slice_var_info(
+                    splited_var)
+
+                self.vars_overview.add_distributed_var(
+                    origin_var=orig_var,
+                    slice_var=splited_var,
+                    block_id=block_id,
+                    offset=offset,
+                    is_slice=is_slice,
+                    vtype="Param")
+
         # origin_grad_name -> [splited_grad_vars]
         self.grad_var_mapping = self._create_vars_from_blocklist(
             self.origin_program,
@@ -1729,13 +2066,6 @@ class DistributeTranspiler(object):
                 shape=new_shape)
             new_inputs[key] = tmpvar
 
-            # var shape been changed
-            if new_shape != var.shape:
-                slice_var_args = self._slice_params_and_optimizes[
-                    param_var.name]
-                self._slice_params_and_optimizes[
-                    var.name] = [var, slice_var_args[1], tmpvar]
-
         # change output's ParamOut variable
         outputs = self._get_output_map_from_op(
             self.origin_program.global_block().vars, opt_op)
@@ -1763,8 +2093,8 @@ class DistributeTranspiler(object):
                 # skip per trainer vars
                 if g.name.find(".trainer_") == -1:
                     # only param or grads have splited blocks
-                    if self._orig_varname(g.name) in self.grad_name_to_param_name or\
-                        self._orig_varname(g.name) in self.param_name_to_grad_name:
+                    if self._orig_varname(g.name) in self.grad_name_to_param_name or \
+                            self._orig_varname(g.name) in self.param_name_to_grad_name:
                         grad_block = g
                         break
         return grad_block
-- 
GitLab


From dbdaf15ca0c0d4fb5264015b4621434ffc36063f Mon Sep 17 00:00:00 2001
From: guomingz <guoming.zhang@intel.com>
Date: Wed, 23 Jan 2019 16:50:38 +0800
Subject: [PATCH 73/73] [V1.3] Add the calibration tool code for int8 inference
 and focus test. (#15062)

* Add the calibration tool code for int8 inference and focus test.

* Fix the calibration tool per the review comments.

test=develop

* Update the calibrator doc and remove extra line.

* Fix the invalid is_negative_input attr set on Mobilenet.

* Add the comments and fix the format issue.

test=develop

* Update the CMakeLists.txt for the Calibration PR. Disable the Calibration UT if MKLDNN is not enabled.

test=develop

* Update the CMakeList.txt.

test=develop

* Disable the test_calibration case on WIN and MAC.

test=develop

* Add the missing brackets.

test=develop

* Remove the outdated map operator which is not supported on Python 3.

test=develop

* Fix the style issue.

test=develop

* 1.Update the CMakeList.txt to disable calibration tool ut when the WITH_MKL is not set;
2.Add the workaround to enable the FLAGS_use_mkldnn for PR_CI(PADDLE).

test=develop

* Fix the typo and format the License header.

test=develop

* 1.Add and Update TODOs per review comments.
2.Code clean.

test=develop
---
 .../fluid/contrib/int8_inference/__init__.py  |  13 +
 .../fluid/contrib/int8_inference/utility.py   | 708 ++++++++++++++++++
 .../paddle/fluid/contrib/tests/CMakeLists.txt |   4 +
 .../fluid/contrib/tests/test_calibration.py   | 230 ++++++
 4 files changed, 955 insertions(+)
 create mode 100644 python/paddle/fluid/contrib/int8_inference/__init__.py
 create mode 100644 python/paddle/fluid/contrib/int8_inference/utility.py
 create mode 100644 python/paddle/fluid/contrib/tests/test_calibration.py

diff --git a/python/paddle/fluid/contrib/int8_inference/__init__.py b/python/paddle/fluid/contrib/int8_inference/__init__.py
new file mode 100644
index 00000000000..eca2dce114b
--- /dev/null
+++ b/python/paddle/fluid/contrib/int8_inference/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/contrib/int8_inference/utility.py b/python/paddle/fluid/contrib/int8_inference/utility.py
new file mode 100644
index 00000000000..197fc5f2d26
--- /dev/null
+++ b/python/paddle/fluid/contrib/int8_inference/utility.py
@@ -0,0 +1,708 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid.core as core
+import numpy as np
+import math
+import os
+import paddle.fluid as fluid
+
+
+class Calibrator(object):
+    '''
+    The calibrator class transforms the program and updates the calculated scale into it.
+    This is INT8 v1 calibration tool, mainly for the support of ResNet-50 and MobileNet.
+    '''
+    # TODO(guomingz): Below op list will be updated once more INT8 op kernels are supported.
+    non_conv_int8_op_type = ("pool2d")  # NOTE(review): a plain str, not a 1-tuple ("pool2d", ); `in` on it matches substrings - confirm intent
+    supported_int8_op_type = ("conv2d", "pool2d")
+    const_sign_op_type = ('pool2d', 'reshape', 'concat', 'transpose')
+    u8_max = 255  # range used for outputs classified as unsigned (relu / fuse_relu)
+    s8_max = 127  # range used for everything else, and for the weights
+
+    def __init__(self, *args, **kwargs):  # every setting below is a required keyword; *args is unused
+        self.program = kwargs['program']
+        self.iterations = kwargs['iterations']
+        self.pretrained_model = kwargs['pretrained_model']
+        self.debug = kwargs['debug']
+        self.algo = kwargs['algo']
+        # Variable-name lists, empty for now; sampling_vars concatenates five of them.
+        self._conv_input_var_name = []
+        self._conv_output_var_name = []
+        self._pool2d_output_var_name = []
+        self._weights_var_name = []
+        self._residual_input_var_name = []
+        self._int8_output_var_op_index_dict = {}
+        self._conv_op_index = [  # positions of all conv2d ops in the global block
+            index for index, value in enumerate(self.program.global_block().ops)
+            if value.type == 'conv2d'
+        ]
+        # Filled later: the maps and u8/s8 lists by __sampling, _persistable_vars by __generate_output_program.
+        self._var_max_value_map = {}
+        self._var_max_range = {}
+        self._weights_scaling_factor = {}
+        self._u8_output_var = []
+        self._s8_output_var = []
+        self._persistable_vars = []
+
+    def generate_sampling_program(self):
+        self.__init_analysis()  # defined outside this view
+        self.__generate_output_program()  # flags the sampling vars persistable, then clones the program
+
+    def generate_quantized_data(self, sampling_data):
+        self.__sampling(sampling_data)  # ranges and weight scales from the sampled data
+        self.__save_scale()  # scale attrs on the conv2d ops of the output program
+        self.__update_program()  # quantize/dequantize op insertion
+        self.__update_output_program_attr()  # persistable flags and int8 dtypes
+        self.__display_debug()  # does nothing unless self.debug is truthy
+
+    def __display_debug(self):
+        if self.debug:
+            self.__dot(self._output_program)  # __dot is defined outside this view
+            print(self._output_program)
+
+    def __get_max_range_by_var_name(self, program, var_name):
+        """
+        Pick the quantization range (u8_max or s8_max) for the variable.
+        Returns Calibrator.u8_max at once if the op that outputs the variable
+        is a relu. Otherwise the producer of the op's first input is looked
+        up repeatedly while it is a pool2d/reshape/concat/transpose op; a
+        conv2d producer decides the result through its fuse_relu attr.
+        Any other producer gives Calibrator.s8_max as the default value.
+        Returns:
+            Calibrator.u8_max for a relu output or a fuse_relu conv2d
+            ancestor, otherwise Calibrator.s8_max.
+        """
+        search_end_index = -1
+        input_index_name = {}
+        output_index_name = {}
+        ops_type = []
+
+        for index, op in enumerate(program.current_block().ops):
+            ops_type.append(op.type)
+
+            input_index_name[index] = op.input_arg_names
+
+            output_index_name[index] = op.output_arg_names
+            if var_name in op.output_arg_names:
+                search_end_index = index
+
+        # analysis
+        while search_end_index >= 0:
+            if ops_type[search_end_index] == "relu":
+                return Calibrator.u8_max
+
+            input_name = input_index_name[search_end_index][0]
+            previous_index = search_end_index
+            for i in output_index_name.keys():
+                if input_name in output_index_name[i]:
+                    search_end_index = i
+                    break
+
+            cur_type = ops_type[search_end_index]
+            if cur_type != 'conv2d':
+                # An unchanged index means no new producer: stop, or the loop never ends.
+                if (cur_type not in Calibrator.const_sign_op_type or
+                        search_end_index == previous_index):
+                    return Calibrator.s8_max
+                continue
+
+            if program.current_block().ops[search_end_index].has_attr(
+                    'fuse_relu') and program.current_block().ops[
+                        search_end_index].attr('fuse_relu'):
+                return Calibrator.u8_max
+            else:
+                return Calibrator.s8_max
+
+        return Calibrator.s8_max
+
+    def __check_op_type_with_specified_var_as_input(self,
+                                                    program,
+                                                    var_name,
+                                                    start_index=0):
+        '''
+        Return True only if every op, from start_index on, that takes the specified
+        variable as an input is int8-enabled (conv2d or pool2d); otherwise return False.
+        '''
+        op_type_list = [
+            op.type for op in program.current_block().ops[start_index:]
+            if var_name in op.input_arg_names
+        ]
+        for i in op_type_list:
+            if not i in Calibrator.supported_int8_op_type:
+                return False
+        return True
+
+    def __check_var_source_dt(self, var_name):
+        '''
+        Look up the specified variable in the int8 output variable/op index dict.
+        If it is recorded there, return the recorded op index.
+        If not, return -1
+        '''
+        return self._int8_output_var_op_index_dict[
+            var_name] if var_name in self._int8_output_var_op_index_dict else -1
+
+    def __update_int8_output_var_op_index_dict(self, index, var_name=None):
+        '''
+        Shift every recorded op index >= index up by one, then record var_name (if given) at index.
+        '''
+        for k, v in self._int8_output_var_op_index_dict.items():
+            if v >= index:
+                self._int8_output_var_op_index_dict[k] = v + 1
+        if var_name:
+            self._int8_output_var_op_index_dict[var_name] = index
+
+    def __update_program(self):
+        '''
+        Insert the quantize/dequantize ops into the output program and set their Scale attr.
+        '''
+        quantize_index, dequantize_index = self.__get_quantize_dequantize_combination(
+            self._output_program)
+        inserted_op_length = 0
+        calc_max_func = self.__get_optimal_scaling_factor if self.algo == "KL" else np.max
+        insert_op_collection = sorted(quantize_index + dequantize_index)
+        # inserted_op_length is added to each index to account for the ops inserted so far.
+        for index in insert_op_collection:
+            if index in quantize_index:
+                quantize_tmp = self._output_program.current_block().create_var(
+                    name="quantize_{}_tmp".format(index),
+                    dtype=core.VarDesc.VarType.UINT8)
+                original_out_name = self._output_program.current_block().ops[
+                    index + inserted_op_length - 1].output_names[0]
+                original_out = self._output_program.current_block().ops[
+                    index + inserted_op_length - 1].output(original_out_name)[0]
+
+                op = self._output_program.current_block()._insert_op(
+                    index=index + inserted_op_length,
+                    type="quantize",
+                    inputs={"Input": original_out},
+                    outputs={"Output": quantize_tmp}, )
+
+                op._set_attr("data_format", "MKLDNNLAYOUT")
+                op._set_attr("use_mkldnn", 1)
+                op._set_attr(
+                    "Scale", self._var_max_range[original_out] /
+                    calc_max_func(self._var_max_value_map[original_out]))
+
+                if self.__get_max_range_by_var_name(
+                        self._output_program,
+                        original_out) == Calibrator.s8_max:
+                    op._set_attr("is_negative_input", 1)
+
+                self.__update_int8_output_var_op_index_dict(
+                    index + inserted_op_length, "quantize_{}_tmp".format(index))
+
+                inserted_op_length += 1
+                for op in self._output_program.current_block().ops[
+                        index + inserted_op_length:]:
+                    for j in op.input_names:
+                        if op.input(j) and op.input(
+                                j
+                        )[0] == original_out and op.type in Calibrator.supported_int8_op_type:
+                            op.desc.set_input(j,
+                                              ["{}".format(quantize_tmp.name)])
+            else:
+                start_index = index + inserted_op_length
+                dequantize_tmp_var = self._output_program.current_block(
+                ).create_var(
+                    name="dequantize_{}_tmp".format(index + 1),
+                    dtype="float32", )
+                original_out_var = None
+
+                for original_input in self._output_program.current_block().ops[
+                        start_index].input_arg_names:
+                    index_res = self.__get_op_index_by_output_var(
+                        self._output_program, original_input)
+                    if index_res != -1:
+                        original_out_var = original_input
+                        break
+
+                if original_out_var:
+                    op = self._output_program.current_block()._insert_op(
+                        index=start_index,
+                        type="dequantize",
+                        inputs={"Input": original_out_var},
+                        outputs={"Output": dequantize_tmp_var})
+                    op._set_attr("data_format", "MKLDNNLAYOUT")
+                    op._set_attr("use_mkldnn", 1)
+                    op._set_attr("Scale", self._var_max_range[original_out_var]
+                                 / calc_max_func(self._var_max_value_map[
+                                     original_out_var]))
+
+                    for op_index in range(
+                            start_index + 1,
+                            len(self._output_program.current_block().ops)):
+                        if self._output_program.current_block(
+                        ).ops[op_index].type == "conv2d" and self._output_program.current_block(
+                        ).ops[op_index].attr("force_fp32_output"):
+                            continue
+                        else:
+                            for j in self._output_program.current_block().ops[
+                                    op_index].input_names:
+                                if len(self._output_program.current_block().ops[
+                                        op_index].input(j)
+                                       ) and self._output_program.current_block(
+                                       ).ops[op_index].input(j)[
+                                           0] == original_out_var:
+                                    self._output_program.current_block(
+                                    ).ops[op_index].desc.set_input(
+                                        j,
+                                        ["{}".format(dequantize_tmp_var.name)])
+
+                    inserted_op_length += 1
+
+                    op._set_attr("data_format", "MKLDNNLAYOUT")  # NOTE(review): repeats the two attrs already set right after _insert_op
+                    op._set_attr("use_mkldnn", 1)
+
+    def __update_output_program_attr(self):
+        for i in self._output_program.list_vars():
+            if i.name in self._persistable_vars:
+                i.persistable = False
+                os.system("rm -rf {}/{}".format(self.pretrained_model, i.name))  # NOTE(review): shell command built from raw names; unsafe if they contain shell metacharacters
+        # Stamp UINT8 / INT8 on the conv outputs that __sampling put into the u8 / s8 lists.
+        for i in self._u8_output_var:
+            self._output_program.current_block().var(i).desc.set_dtype(
+                core.VarDesc.VarType.UINT8)
+
+        for i in self._s8_output_var:
+            self._output_program.current_block().var(i).desc.set_dtype(
+                core.VarDesc.VarType.INT8)
+
+    @property
+    def sampling_program(self):
+        return self._output_program  # the clone created by __generate_output_program
+
+    @property
+    def sampling_vars(self):  # one flat list of all variable names to sample; rebuilt on every access
+        return self._weights_var_name + self._conv_input_var_name + self._conv_output_var_name + self._residual_input_var_name + self._pool2d_output_var_name
+
+    def _is_close(self, a, b, rel_tol=1e-09, abs_tol=0.0):  # True when a and b differ by at most the larger of the relative and absolute tolerance
+        return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
+    def __generate_output_program(self):
+        for i in self.program.list_vars():
+            if not i.persistable and i.name in self.sampling_vars:
+                i.persistable = True
+                self._persistable_vars.append(i.name)  # remembered so __update_output_program_attr can reset the flag
+        # The clone is taken only after the sampling vars were flagged persistable.
+        self._output_program = self.program.clone()
+
+    def __save_scale(self):
+        '''
+        Set Scale_weights/Scale_in/Scale_out (and Scale_in_eltwise) on every conv2d op except the first.
+        '''
+        func = self.__get_optimal_scaling_factor if self.algo == 'KL' else np.max
+        for i in self._conv_op_index[1:]:
+            weights_var_name = self.program.current_block().ops[i].input(
+                'Filter')[0]
+            input_var_name = self.program.current_block().ops[i].input('Input')[
+                0]
+            output_var_name = self.program.current_block().ops[i].output(
+                'Output')[0]
+            self._output_program.current_block().ops[i]._set_attr(
+                "Scale_weights", self._weights_scaling_factor[weights_var_name])
+
+            self._output_program.current_block().ops[i]._set_attr(
+                "Scale_in", self._var_max_range[input_var_name] /
+                func(self._var_max_value_map[input_var_name]))
+            self._output_program.current_block().ops[i]._set_attr(
+                "Scale_out", self._var_max_range[output_var_name] /
+                func(self._var_max_value_map[output_var_name]))
+            if self._output_program.current_block().ops[i].desc.input(
+                    "ResidualData"):
+                residual_var_name = self._output_program.current_block().ops[
+                    i].desc.input("ResidualData")[0]
+                self._output_program.current_block().ops[i]._set_attr(
+                    "Scale_in_eltwise", self._var_max_range[residual_var_name] /
+                    func(self._var_max_value_map[residual_var_name]))
+
+    def __sampling(self, sampling_data):
+        '''
+        Record per-channel weight scales, and the max range and abs values of the other sampling vars.
+        '''
+        for i in self.program.list_vars():
+            if i.name not in self.sampling_vars:
+                continue
+
+            if i.name in self._weights_var_name:
+                scaling_factor_per_channel = []
+                data = sampling_data[i.name][0]
+                for j in range(data.shape[0]):
+                    var_value = float(np.max(np.abs(data[j])))
+                    if not self._is_close(var_value, 0.0):
+                        scaling_factor_per_channel.append(Calibrator.s8_max /
+                                                          var_value)
+                    else:
+                        scaling_factor_per_channel.append(0.0)
+                self._weights_scaling_factor[
+                    i.name] = scaling_factor_per_channel
+            else:
+                if i.name in self._conv_output_var_name:
+                    op_pos = self.__get_op_index_by_output_var(self.program,
+                                                               i.name)
+                    cur_op = self.program.current_block().ops[op_pos]
+
+                    if cur_op.has_attr('fuse_relu') and cur_op.attr(
+                            'fuse_relu'):
+                        max_range = Calibrator.u8_max
+                        self._u8_output_var.append(i.name)
+                    else:
+                        max_range = Calibrator.s8_max
+                        self._s8_output_var.append(i.name)
+                else:
+                    max_range = self.__get_max_range_by_var_name(self.program,
+                                                                 i.name)
+                max_value = [[np.abs(np_data)]
+                             for np_data in sampling_data[i.name]]
+
+                self._var_max_range[i.name] = max_range
+                self._var_max_value_map[i.name] = max_value
+
+    def __check_force_fp32_attr_by_output_var(self, program, var_name):
+        # force_fp32_output of the first conv2d writing var_name, else False.
+        convs = [op for op in program.current_block().ops if op.type == "conv2d"]
+        hit = next((op for op in convs if var_name in op.output_arg_names), None)
+        return hit.attr("force_fp32_output") if hit is not None else False
+
+    def __get_op_index_by_output_var(self, program, var_name, start_index=0):
+        '''
+        Find the first op at or after start_index that outputs var_name and
+        whose type is listed in Calibrator.supported_int8_op_type.
+
+        Returns:
+            The position of that op counted from start_index (the search
+            enumerates the slice ops[start_index:], so add start_index to get
+            the position in the block), or -1 when no such op exists.
+            NOTE(review): callers indexing block.ops with it need start_index=0.
+        '''
+        for index, op in enumerate(program.current_block().ops[start_index:]):
+            if var_name in op.output_arg_names and op.type in Calibrator.supported_int8_op_type:
+                return index
+        return -1
+
+    def __get_op_index_by_input_var(self, program, var_name, start_index=0):
+        '''
+        Find the first op at or after start_index that reads var_name.
+        Returns:
+            The op position counted from start_index (offset into the slice
+            ops[start_index:]), or -1 if no op takes var_name as an input.
+        '''
+        for index, op in enumerate(program.current_block().ops[start_index:]):
+            if var_name in op.input_arg_names:
+                return index
+
+        return -1
+
+    def __get_quantize_dequantize_combination(self, program):
+        """
+        Get the quantize/dequantize op indices for further inserting.
+        Args:
+            program: the program whose current block ops are inspected; some conv ops get force_fp32_output set to True as a side effect.
+        Returns:
+            (quantize_op_index, dequantize_op_index): two lists of op indices; both are empty when fewer than two conv ops were recorded.
+        """
+        quantize_op_index = []
+        dequantize_op_index = []
+        minimal_conv_count = 2  # at least two conv ops are required when int8 is not enabled for the first conv.
+        if len(self._conv_op_index) < minimal_conv_count:
+            return [], []
+
+        for index, value in enumerate(self._conv_op_index):
+            if index == 0:
+                quantize_op_index.append(self._conv_op_index[index + 1])
+            elif index == len(self._conv_op_index) - 1:
+                output_var = program.current_block().ops[value].output(
+                    "Output")[0]
+                if self.__check_op_type_with_specified_var_as_input(
+                        program, output_var, index):  # NOTE(review): passes the position in _conv_op_index, not the op index `value` - confirm
+                    dequantize_op_index.append(self._conv_op_index[index] + 2)
+                else:
+                    program.current_block().ops[value]._set_attr(
+                        "force_fp32_output", True)
+
+            elif self._conv_op_index[index] + 1 < self._conv_op_index[index +
+                                                                      1]:
+                # Other ops sit between this conv and the next one: this conv is marked force_fp32_output.
+                program.current_block().ops[self._conv_op_index[
+                    index]]._set_attr("force_fp32_output", True)
+                # Scan from the next conv back to the op right after this conv.
+                for op_index in range(self._conv_op_index[index + 1],
+                                      self._conv_op_index[index], -1):
+                    op_type = program.current_block().ops[op_index].type
+                    op_has_int8_input = False
+                    input_var_name = None
+                    input_length = len(program.current_block().ops[op_index]
+                                       .input_arg_names)
+
+                    for var_name in program.current_block().ops[
+                            op_index].input_arg_names:
+                        if self.__check_var_source_dt(var_name) != -1:
+                            op_has_int8_input = True
+                            input_var_name = var_name
+                            break
+
+                    if op_has_int8_input:
+                        if op_type == "conv2d":
+                            if program.current_block().ops[op_index +
+                                                           1].type == "conv2d":
+                                continue
+                            elif program.current_block(
+                            ).ops[op_index +
+                                  1].type in Calibrator.non_conv_int8_op_type:
+                                dequantize_op_index.append(op_index + 2)
+                                break
+                            else:
+                                program.current_block().ops[op_index]._set_attr(
+                                    "force_fp32_output", True)
+                                continue
+                        elif not self.__check_force_fp32_attr_by_output_var(
+                                program, input_var_name
+                        ) and op_index not in dequantize_op_index:
+                            share_input_flag = True
+                            for input_attr_name in program.current_block().ops[
+                                    op_index].input_names:
+                                input_var_name = program.current_block().ops[
+                                    op_index].input(input_attr_name)[0]
+                                cousin_op_index = self.__get_op_index_by_input_var(
+                                    program, input_var_name)
+                                if cousin_op_index != -1 and cousin_op_index in dequantize_op_index:
+                                    share_input_flag = False
+                                    break
+                            if share_input_flag:
+                                dequantize_op_index.append(op_index)
+
+                    elif input_length:
+                        output_is_to_int8_op = False
+                        share_input_flag = True
+                        for var_name in program.current_block().ops[
+                                op_index].input_arg_names:
+                            if not self.__check_op_type_with_specified_var_as_input(
+                                    program, var_name):
+                                share_input_flag = False
+                                break
+
+                        for var_name in program.current_block().ops[
+                                op_index].output_arg_names:
+                            if self.__get_op_index_by_output_var(
+                                    program, var_name, op_index) != -1:
+                                output_is_to_int8_op = True
+                                break
+
+                        if share_input_flag or output_is_to_int8_op:
+                            quantize_op_index.append(op_index)
+
+        return quantize_op_index, dequantize_op_index
+
+    def __init_analysis(self):
+        '''
+        Collect the variable names for sampling from every recorded conv op except the first one.
+        '''
+        start_index = 1  # analyse the conv ops starting from the second one.
+        # Filter, Input, Output and (when present) ResidualData names are recorded per conv op.
+        for i in self._conv_op_index[start_index:]:
+            self._weights_var_name.append(self.program.current_block().ops[i]
+                                          .input('Filter')[0])
+            self._conv_input_var_name.append(self.program.current_block().ops[i]
+                                             .input('Input')[0])
+            self._conv_output_var_name.append(self.program.current_block().ops[
+                i].output('Output')[0])
+            self._int8_output_var_op_index_dict[self.program.current_block()
+                                                .ops[i].output('Output')[0]] = i
+            if self.program.current_block().ops[i].desc.input("ResidualData"):
+                self._residual_input_var_name.append(self.program.current_block(
+                ).ops[i].desc.input("ResidualData")[0])
+            # NOTE(review): ops[i + 1] assumes a conv op is never the last op of the block.
+            if self.program.current_block().ops[i + 1].type == "pool2d":
+                self._pool2d_output_var_name.append(self.program.current_block(
+                ).ops[i + 1].output('Out')[0])
+
+    def __expand_quantized_bins(self, quantized_bins, reference_bins):
+        '''
+        Expand quantized_bins to len(reference_bins) entries: each value is
+        shared evenly by the non-zero reference bins it was merged from.
+        '''
+        expanded_quantized_bins = [0] * len(reference_bins)
+        num_merged_bins = len(reference_bins) // len(quantized_bins)  # int on py2 and py3
+        j_start, j_end = 0, num_merged_bins
+        for idx in range(len(quantized_bins)):
+            zero_count = reference_bins[j_start:j_end].count(0)
+            num_merged_bins = j_end - j_start
+            nonzero_count = num_merged_bins - zero_count
+            avg_bin_ele = (quantized_bins[idx] / (nonzero_count + 0.0)
+                           if nonzero_count else 0)
+            for idx1 in range(j_start, j_end):
+                if reference_bins[idx1] != 0:
+                    expanded_quantized_bins[idx1] = avg_bin_ele
+            j_start, j_end = j_end, j_end + num_merged_bins
+            if (idx + 1) == len(quantized_bins) - 1:
+                j_end = len(reference_bins)
+        return expanded_quantized_bins
+
+    def __safe_entropy(self, reference_distr_P, P_sum, candidate_distr_Q,
+                       Q_sum):
+        '''
+        Return sum(p * log(Q_sum * p)) - sum(p * log(P_sum * q)) over the bins
+        with p != 0, divided by P_sum, i.e. the KL divergence of P / P_sum
+        from Q / Q_sum.
+        '''
+        assert len(reference_distr_P) == len(candidate_distr_Q)
+        weighted_log_p = 0
+        weighted_log_q = 0
+        for pos, (p_val, q_val) in enumerate(
+                zip(reference_distr_P, candidate_distr_Q)):
+            if p_val == 0:
+                continue
+            # A zero q with a non-zero p makes math.log below raise ValueError.
+            if q_val == 0:
+                print("Fatal error!, idx = " + str(pos) +
+                      " qindex = 0! p_idx = " + str(p_val))
+            weighted_log_p += p_val * (math.log(Q_sum * p_val))
+            weighted_log_q += p_val * (math.log(P_sum * q_val))
+        return (weighted_log_p - weighted_log_q) / P_sum
+
+    # Reference: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
+    def __get_optimal_scaling_factor(self,
+                                     activation_blob,
+                                     num_quantized_bins=255):
+        '''
+        Use the KL-divergence method to get a more precise scaling factor.
+
+        A 2048-bin histogram of activation_blob is built; for every candidate
+        clipping bin i the first i bins (outliers folded into the last one)
+        are compared with their num_quantized_bins-level quantization, and
+        the i with the smallest divergence is kept.
+
+        Args:
+            activation_blob: array-like of sampled activation values.
+            num_quantized_bins: number of quantization levels, 255 by default.
+
+        Returns:
+            (best_bin_index + 0.5) * histogram_bin_width.
+        '''
+        max_val = np.max(activation_blob)
+        min_val = np.min(activation_blob)
+        if min_val >= 0:
+            hist, hist_edges = np.histogram(
+                activation_blob, bins=2048, range=(min_val, max_val))
+            ending_iter = 2047
+            starting_iter = int(ending_iter * 0.7)
+        else:
+            th = max(abs(max_val), abs(min_val))
+            hist, hist_edges = np.histogram(
+                activation_blob, bins=2048, range=(-th, th))
+            starting_iter = 0
+            ending_iter = 2047
+            if abs(max_val) > abs(min_val):
+                while starting_iter < ending_iter and hist[starting_iter] == 0:
+                    starting_iter += 1
+                starting_iter += int((ending_iter - starting_iter) * 0.6)
+            else:
+                while ending_iter > 0 and hist[ending_iter] == 0:
+                    ending_iter -= 1
+                starting_iter = int(0.6 * ending_iter)
+        bin_width = hist_edges[1] - hist_edges[0]
+        # NOTE(review): len() only counts the first axis; presumably the blob is 1-D.
+        P_sum = len(activation_blob)
+        min_kl_divergence = 0
+        min_kl_index = 0
+        kl_inited = False
+        for i in range(starting_iter, ending_iter + 1):
+            reference_distr_P = hist[0:i].tolist()
+            outliers_count = sum(hist[i:2048])
+            if reference_distr_P[i - 1] == 0:
+                continue
+            reference_distr_P[i - 1] += outliers_count
+            reference_distr_bins = reference_distr_P[:]
+            candidate_distr_Q = hist[0:i].tolist()
+            # Floor division: slice bounds must stay integers on Python 3 too.
+            num_merged_bins = i // num_quantized_bins
+            candidate_distr_Q_quantized = [0] * num_quantized_bins
+            j_start = 0
+            j_end = num_merged_bins
+            for idx in range(num_quantized_bins):
+                candidate_distr_Q_quantized[idx] = sum(candidate_distr_Q[
+                    j_start:j_end])
+                j_start += num_merged_bins
+                j_end += num_merged_bins
+                if (idx + 1) == num_quantized_bins - 1:
+                    j_end = i
+            candidate_distr_Q = self.__expand_quantized_bins(
+                candidate_distr_Q_quantized, reference_distr_bins)
+            Q_sum = sum(candidate_distr_Q)
+            kl_divergence = self.__safe_entropy(reference_distr_P, P_sum,
+                                                candidate_distr_Q, Q_sum)
+            if not kl_inited:
+                min_kl_divergence = kl_divergence
+                min_kl_index = i
+                kl_inited = True
+            elif kl_divergence < min_kl_divergence:
+                min_kl_divergence = kl_divergence
+                min_kl_index = i
+        if min_kl_index == 0:
+            while starting_iter > 0 and hist[starting_iter] == 0:
+                starting_iter -= 1
+            min_kl_index = starting_iter
+        return (min_kl_index + 0.5) * bin_width
+
+    @staticmethod
+    def __dot(program, output_name="model.dot"):
+        '''
+        Generate the Graphviz dot file for debugging: every op of every block becomes a box node linked to its input/output variable names, written to output_name.
+        '''
+        dot_graph = ""
+        dot_nodes = []
+        dot_edges = []
+        dot_graph += "digraph pm {\n"
+        for block in program.blocks:
+            ops = list(block.ops)
+            for index, op in enumerate(ops):
+                op_type = op.type
+                op_name = op_type + "_" + op.output_arg_names[0].replace(
+                    ".", "_") + "___" + str(index)  # NOTE(review): needs at least one output name per op
+                for name in op.input_arg_names:
+                    name = name.replace(".", "_")
+                    dot_edge = name + " -> " + op_name
+                    if dot_edge not in dot_edges:
+                        dot_edges.append(dot_edge)
+                    dot_node = name + " [shape=oval, style=filled, fillcolor=yellow]"
+                    if dot_node not in dot_nodes:
+                        dot_nodes.append(dot_node)
+                # The output loop below only adds edges; no node line is emitted for output names here.
+                for name in op.output_arg_names:
+                    name = name.replace(".", "_")
+                    dot_edge = op_name + " -> " + name
+                    if dot_edge not in dot_edges:
+                        dot_edges.append(dot_edge)
+                if op_type in Calibrator.supported_int8_op_type:
+                    if op_type == "conv2d" and op.has_attr(
+                            'force_fp32_output') and op.attr(
+                                "force_fp32_output"):
+                        dot_node = op_name + " [shape=box, style=filled, color=deeppink]"
+                    else:
+                        dot_node = op_name + " [shape=box, style=filled, color=greenyellow]"
+                elif op_type in ["quantize", "dequantize"]:
+                    dot_node = op_name + " [shape=box, style=filled, color=gold]"
+                else:
+                    dot_node = op_name + " [shape=box, style=filled, fillcolor=red]"
+                # deeppink: conv2d with force_fp32_output; greenyellow: other supported int8 ops; gold: quantize/dequantize; red: the rest.
+                if dot_node not in dot_nodes:
+                    dot_nodes.append(dot_node)
+
+        for dot_edge in dot_edges:
+            dot_graph += dot_edge + "\n"
+        for dot_node in dot_nodes:
+            dot_graph += dot_node + "\n"
+        dot_graph += "}"
+
+        with open(output_name, 'w') as f:
+            f.write(dot_graph)
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index 79bec8c4ad3..81aee1233d1 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -1,6 +1,10 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+if(APPLE OR WIN32 OR NOT WITH_MKL)
+    list(REMOVE_ITEM TEST_OPS test_calibration)
+endif()
+
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
new file mode 100644
index 00000000000..17e4eb8b831
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -0,0 +1,230 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import os
+import numpy as np
+import time
+import sys
+import random
+import paddle
+import paddle.fluid as fluid
+import argparse
+import functools
+import contextlib
+import paddle.fluid.profiler as profiler
+from PIL import Image, ImageEnhance
+import math
+sys.path.append('..')
+import int8_inference.utility as ut
+
+random.seed(0)
+np.random.seed(0)
+
+DATA_DIM = 224
+
+THREAD = 1
+BUF_SIZE = 102400
+
+DATA_DIR = 'data/ILSVRC2012'
+
+img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
+img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
+
+
+# TODO(guomingz): Remove duplicated code from line 45 ~ line 114
+def resize_short(img, target_size):
+    # Scale img so that its shorter side becomes target_size, keeping the ratio.
+    width, height = img.size[0], img.size[1]
+    scale = float(target_size) / min(width, height)
+    new_size = (int(round(width * scale)), int(round(height * scale)))
+    return img.resize(new_size, Image.LANCZOS)
+
+
+def crop_image(img, target_size, center):
+    """Crop a target_size x target_size square out of img.
+
+    The square is centered when center == True; otherwise its top-left
+    corner is drawn with np.random.randint.
+    """
+    width, height = img.size
+    if center == True:
+        left, top = (width - target_size) / 2, (height - target_size) / 2
+    else:
+        left = np.random.randint(0, width - target_size + 1)
+        top = np.random.randint(0, height - target_size + 1)
+    return img.crop((left, top, left + target_size, top + target_size))
+
+
+def process_image(sample, mode, color_jitter, rotate):
+    """Open sample[0], resize/center-crop it to DATA_DIM and return (array, sample[1]).
+
+    mode, color_jitter and rotate are accepted but not used.
+    """
+    picture = Image.open(sample[0])
+    picture = resize_short(picture, target_size=256)
+    picture = crop_image(picture, target_size=DATA_DIM, center=True)
+    if picture.mode != 'RGB':
+        picture = picture.convert('RGB')
+
+    pixels = np.array(picture).astype('float32').transpose((2, 0, 1)) / 255
+    pixels -= img_mean
+    pixels /= img_std
+
+    return pixels, sample[1]
+
+
+def _reader_creator(file_list,
+                    mode,
+                    shuffle=False,
+                    color_jitter=False,
+                    rotate=False,
+                    data_dir=DATA_DIR):
+    """Create a reader over the "<image path> <label>" lines of file_list.
+
+    Lines are shuffled when shuffle is set; entries whose image is missing
+    under data_dir are skipped; the rest go through process_image.
+    """
+    def reader():
+        with open(file_list) as flist:
+            entries = [raw.strip() for raw in flist]
+            if shuffle:
+                np.random.shuffle(entries)
+            for entry in entries:
+                rel_path, label = entry.split()
+                full_path = os.path.join(data_dir, rel_path)
+                if os.path.exists(full_path):
+                    yield full_path, int(label)
+
+    mapper = functools.partial(
+        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
+    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
+
+
+def val(data_dir=DATA_DIR):
+    list_path = os.path.join(data_dir, 'val_list.txt')  # validation index file
+    return _reader_creator(list_path, 'val', data_dir=data_dir, shuffle=False)
+
+
+class TestCalibration(unittest.TestCase):
+    """Compare acc1 of the FP32 ResNet50 model with the model saved after calibration."""
+    def setUp(self):
+        # TODO(guomingz): Put the download process in the cmake.
+        # Download and unzip test data set
+        imagenet_dl_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
+        zip_file_name = imagenet_dl_url.split('/')[-1]
+        cmd = 'rm -rf data {}  && mkdir data && wget {} && tar xvf {} -C data'.format(
+            zip_file_name, imagenet_dl_url, zip_file_name)
+        os.system(cmd)
+        # resnet50 fp32 data
+        resnet50_fp32_model_url = 'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
+        resnet50_zip_name = resnet50_fp32_model_url.split('/')[-1]
+        resnet50_unzip_folder_name = 'resnet50_fp32'
+        cmd = 'rm -rf {} {} && mkdir {} && wget {} && tar xvf {} -C {}'.format(
+            resnet50_unzip_folder_name, resnet50_zip_name,
+            resnet50_unzip_folder_name, resnet50_fp32_model_url,
+            resnet50_zip_name, resnet50_unzip_folder_name)
+        os.system(cmd)
+
+        self.iterations = 100
+        self.skip_batch_num = 5
+
+    def run_program(self, model_path, generate_int8=False, algo='direct'):
+        """Return the mean acc1 of model_path, or (generate_int8=True) calibrate it and save the result to ./calibration_out."""
+        image_shape = [3, 224, 224]
+        os.environ['FLAGS_use_mkldnn'] = 'True'
+
+        fluid.memory_optimize(fluid.default_main_program())
+
+        exe = fluid.Executor(fluid.CPUPlace())
+
+        [infer_program, feed_dict,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+
+        t = fluid.transpiler.InferenceTranspiler()
+        t.transpile(infer_program, fluid.CPUPlace())
+
+        val_reader = paddle.batch(val(), batch_size=1)
+
+        if generate_int8:
+            int8_model = os.path.join(os.getcwd(), "calibration_out")
+
+            if os.path.exists(int8_model):
+                os.system("rm -rf " + int8_model)
+                os.system("mkdir " + int8_model)
+
+            print("Start calibration ...")
+
+            calibrator = ut.Calibrator(
+                program=infer_program,
+                pretrained_model=model_path,
+                iterations=self.iterations,
+                debug=False,
+                algo=algo)
+
+            sampling_data = {}
+
+            calibrator.generate_sampling_program()
+        test_info = []
+        cnt = 0
+        for batch_id, data in enumerate(val_reader()):
+            image = np.array(
+                [x[0].reshape(image_shape) for x in data]).astype("float32")
+            label = np.array([x[1] for x in data]).astype("int64")
+            label = label.reshape([-1, 1])
+            running_program = calibrator.sampling_program.clone(
+            ) if generate_int8 else infer_program.clone()
+            for op in running_program.current_block().ops:
+                if op.has_attr("use_mkldnn"):
+                    op._set_attr("use_mkldnn", True)
+
+            _, acc1, _ = exe.run(
+                running_program,
+                feed={feed_dict[0]: image,
+                      feed_dict[1]: label},
+                fetch_list=fetch_targets)
+            if generate_int8:
+                for i in calibrator.sampling_program.list_vars():
+                    if i.name in calibrator.sampling_vars:
+                        np_data = np.array(fluid.global_scope().find_var(i.name)
+                                           .get_tensor())
+                        if i.name not in sampling_data:
+                            sampling_data[i.name] = []
+                        sampling_data[i.name].append(np_data)
+
+            test_info.append(np.mean(acc1) * len(data))
+            cnt += len(data)
+
+            if batch_id == self.iterations - 1:
+                break
+
+        if generate_int8:
+            calibrator.generate_quantized_data(sampling_data)
+            fluid.io.save_inference_model(int8_model, feed_dict, fetch_targets,
+                                          exe, calibrator.sampling_program)
+            print(
+                "Calibration is done and the corresponding files were generated at {}".
+                format(os.path.abspath("calibration_out")))
+        else:
+            return np.sum(test_info) / cnt
+
+    def test_calibration_for_resnet50(self):
+        fp32_acc1 = self.run_program("resnet50_fp32/model")
+        self.run_program("resnet50_fp32/model", True)
+        int8_acc1 = self.run_program("calibration_out")
+        delta_value = np.abs(fp32_acc1 - int8_acc1)
+        self.assertLess(delta_value, 0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab